Mental Health Treatment Prediction: Language Optimization

Lately, I've taken an interest in modern psychiatric practice as one avenue toward addressing the mental health crisis. Beyond novel medical practice, I see significant potential in combining natural language processing with data analysis. For this project I took the OSMI Mental Health in Tech survey dataset and used it to predict whether respondents sought out treatment. Using a TensorFlow neural-network classifier on the processed survey data, model accuracy hovered around 67 to 69%. However, this model ignores the natural language data in the 'comments' column. The modest accuracy suggests the comment text may hold unstated drivers of treatment-seeking that the structured features miss, which I explore next.


import numpy as np  # 1.24.1

import pandas as pd  # 1.5.2

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import tensorflow as tf  # 2.11.0

data = pd.read_csv('survey.csv')


# CLEANING

langdata = data.loc[:, "comments"]  # save comments for the NLP follow-up

langdata = langdata.dropna()


data = data.drop('comments', axis=1)

# data['self_employed'].unique()  -> NaN, 'Yes', 'No'

# data['self_employed'].mode()  -> 'No' is the most common value

data['self_employed'] = data['self_employed'].fillna('No')

data['work_interfere'] = data['work_interfere'].fillna('Sometimes')


# FEATURE ENGINEERING

data['Year'] = data['Timestamp'].apply(lambda x: np.int64(x[0:4]))

data['Month'] = data['Timestamp'].apply(lambda x: np.int64(x[5:7]))

data['Day'] = data['Timestamp'].apply(lambda x: np.int64(x[8:10]))

data['Hour'] = data['Timestamp'].apply(lambda x: np.int64(x[11:13]))

data['Minute'] = data['Timestamp'].apply(lambda x: np.int64(x[14:16]))

data['Second'] = data['Timestamp'].apply(lambda x: np.int64(x[17:19]))

data = data.drop('Timestamp', axis=1)
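
# NOTE (alternative sketch, not used in this pipeline): pd.to_datetime can

# parse the timestamp in one pass, which is more robust than fixed slicing:

#   ts = pd.to_datetime(data['Timestamp'])  # run before the drop above

#   data['Year'], data['Month'], data['Day'] = ts.dt.year, ts.dt.month, ts.dt.day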


# ENCODING FEATURES

# Inspect the categorical columns before encoding

{column: len(data[column].unique())

 for column in data.select_dtypes('object').columns}  # number of unique values

{column: list(data[column].unique())

 for column in data.select_dtypes('object').columns}  # the values themselves


def encode_gender(x):   # 'Gender' was a free-text field, so bucket by first letter

    if x.lower()[0] == 'f': return 0

    elif x.lower()[0] == 'm': return 1

    else: return 2


data['Gender'] = data['Gender'].apply(encode_gender)

target = 'treatment'


binary_features = [

    'self_employed',

    'family_history',

    'remote_work',

    'tech_company',

    'obs_consequence']

ordinal_features = [

    'work_interfere',

    'no_employees']

nominal_features = [

    'Country',

    'state',

    'benefits',

    'care_options',

    'wellness_program',

    'seek_help',

    'anonymity',

    'leave',

    'mental_health_consequence',

    'phys_health_consequence',

    'coworkers',

    'supervisor',

    'mental_health_interview',

    'phys_health_interview',

    'mental_vs_physical']


def binary_encode(df, columns, positive_values):

    df = df.copy()

    for column, positive_value in zip(columns, positive_values):

        df[column] = df[column].apply(

            lambda x: 1 if x == positive_value else 0)

    return df

def ordinal_encode(df, columns, orderings):

    df = df.copy()

    for column, ordering in zip(columns, orderings):

        df[column] = df[column].apply(lambda x: ordering.index(x))

    return df

def onehot_encode(df, columns, prefixes):

    df = df.copy()

    for column, prefix in zip(columns, prefixes):

        dummies = pd.get_dummies(df[column], prefix)

        df = pd.concat([df, dummies], axis=1)

        df = df.drop(column, axis=1)

    return df
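
# Illustrative sanity check (toy frame, not part of the pipeline):

#   binary_encode(pd.DataFrame({'f': ['Yes', 'No', None]}), ['f'], ['Yes'])['f']

#   -> [1, 0, 0]; anything but the positive value, including NaN, maps to 0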


binary_positive_values = ['Yes' for feature in binary_features]

ordinal_orderings = [

    ['Never', 'Rarely', 'Sometimes', 'Often'],

    ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']]

nominal_prefixes = ['co', 'st', 'be', 'ca', 'we', 'se', 'an', 'le',

                    'mc', 'ph', 'cw', 'su', 'mi', 'pi', 'mp']  # one prefix per nominal feature

# Encoding Operations

data = binary_encode(

    data,

    columns=binary_features,

    positive_values=binary_positive_values)

data = ordinal_encode(

    data,

    columns=ordinal_features,

    orderings=ordinal_orderings)

data = onehot_encode(

    data,

    columns=nominal_features,

    prefixes=nominal_prefixes)


# Encoding 'Labels'

data = binary_encode(data, columns=['treatment'], positive_values=['Yes'])


# SPLITTING/SCALING

y = data['treatment'].copy()

X = data.drop('treatment', axis=1).copy()

scaler = StandardScaler()  # standardize each column to zero mean, unit variance

X = scaler.fit_transform(X)
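
# NOTE: fitting the scaler on all of X before splitting leaks test-set

# statistics into training; a stricter variant (sketch) fits on the training

# split only:

#   X_train = scaler.fit_transform(X_train)

#   X_test = scaler.transform(X_test)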


X_train, X_test, y_train, y_test = train_test_split(  # derive train/test sets

    X, y, train_size=0.7, random_state=100)


# TRAINING

inputs = tf.keras.Input(shape=(X.shape[1],))  # number of features

x = tf.keras.layers.Dense(1024, activation='relu')(inputs)

x = tf.keras.layers.Dense(1024, activation='relu')(x)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)


model = tf.keras.Model(inputs, outputs)

model.compile(

    optimizer='adam',

    loss='binary_crossentropy',

    metrics=[

        'accuracy',

        tf.keras.metrics.AUC(name='auc')])  # class distribution is 49.7% / 50.3%

batch_size = 64

epochs = 50


history = model.fit(

    X_train,

    y_train,

    validation_split=0.2,

    batch_size=batch_size,

    epochs=epochs,

    callbacks=[

        # ReduceLROnPlateau monitors validation loss (defaults: factor=0.1,

        # patience=10) and lowers the learning rate when the loss stops

        # improving, helping the model converge instead of oscillating

        tf.keras.callbacks.ReduceLROnPlateau()])


# PLOTTING

plt.figure(figsize=(12, 6))

plt.plot(range(epochs), history.history['accuracy'], label="Training Accuracy")

plt.plot(range(epochs),  # progression of metric 'accuracy' over each epoch

         history.history['val_accuracy'], label="Validation Accuracy")

plt.xlabel("Epoch")

plt.ylabel("Accuracy")

plt.legend()

plt.title("Accuracy Over Time")

plt.show()
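
The accuracy plotted above covers the training and validation sets only. As a final check, here is a minimal sketch of scoring the held-out 30% test split, assuming the variables from the script above are still in scope (model.evaluate returns the loss followed by the compiled metrics, in order):

test_loss, test_acc, test_auc = model.evaluate(X_test, y_test, verbose=0)

print(f"Test accuracy: {test_acc:.3f}, test AUC: {test_auc:.3f}")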