Mental Health Treatment Prediction: Language Optimization

Lately, I've taken an interest in modern psychiatric practice as one avenue toward addressing the mental health crisis. Beyond novel medical practice, I see significant potential in combining natural language processing with data analysis. For this project I took the OSMI Mental Health in Tech survey dataset and used it to predict whether respondents sought out treatment. Using a TensorFlow neural-network classifier on the processed survey data, model accuracy hovered around 67 to 69%. However, this model ignores the natural language data in the 'comments' column. The modest accuracy suggests the comment text may hold unstated drivers of treatment-seeking that the structured features miss, which I explore next.


import numpy as np  # 1.24.1

import pandas as pd  # 1.5.2

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import tensorflow as tf  # 2.11.0

data = pd.read_csv('survey.csv')


# CLEANING

langdata = data.loc[:, "comments"]  # save comments for the NLP follow-up

langdata = langdata.dropna()


data = data.drop('comments', axis=1)

# data['self_employed'].unique()  -> NaN, 'Yes', 'No'

# data['self_employed'].mode()  -> 'No' is the most common value

data['self_employed'] = data['self_employed'].fillna('No')

data['work_interfere'] = data['work_interfere'].fillna('Sometimes')


# FEATURE ENGINEERING

data['Year'] = data['Timestamp'].apply(lambda x: np.int64(x[0:4]))

data['Month'] = data['Timestamp'].apply(lambda x: np.int64(x[5:7]))

data['Day'] = data['Timestamp'].apply(lambda x: np.int64(x[8:10]))

data['Hour'] = data['Timestamp'].apply(lambda x: np.int64(x[11:13]))

data['Minute'] = data['Timestamp'].apply(lambda x: np.int64(x[14:16]))

data['Second'] = data['Timestamp'].apply(lambda x: np.int64(x[17:19]))

data = data.drop('Timestamp', axis=1)
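
# NOTE (alternative sketch, not used in this pipeline): pd.to_datetime can

# parse the timestamp in one pass, which is more robust than fixed slicing:

#   ts = pd.to_datetime(data['Timestamp'])  # run before the drop above

#   data['Year'], data['Month'], data['Day'] = ts.dt.year, ts.dt.month, ts.dt.day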


# ENCODING FEATURES

# Inspect the categorical columns before encoding

{column: len(data[column].unique())

 for column in data.select_dtypes('object').columns}  # number of unique values

{column: list(data[column].unique())

 for column in data.select_dtypes('object').columns}  # the values themselves


def encode_gender(x):   # 'Gender' was a free-text field, so bucket by first letter

    if x.lower()[0] == 'f': return 0

    elif x.lower()[0] == 'm': return 1

    else: return 2


data['Gender'] = data['Gender'].apply(encode_gender)

target = 'treatment'


binary_features = [

    'self_employed',

    'family_history',

    'remote_work',

    'tech_company',

    'obs_consequence']

ordinal_features = [

    'work_interfere',

    'no_employees']

nominal_features = [

    'Country',

    'state',

    'benefits',

    'care_options',

    'wellness_program',

    'seek_help',

    'anonymity',

    'leave',

    'mental_health_consequence',

    'phys_health_consequence',

    'coworkers',

    'supervisor',

    'mental_health_interview',

    'phys_health_interview',

    'mental_vs_physical']


def binary_encode(df, columns, positive_values):

    df = df.copy()

    for column, positive_value in zip(columns, positive_values):

        df[column] = df[column].apply(

            lambda x: 1 if x == positive_value else 0)

    return df

def ordinal_encode(df, columns, orderings):

    df = df.copy()

    for column, ordering in zip(columns, orderings):

        df[column] = df[column].apply(lambda x: ordering.index(x))

    return df

def onehot_encode(df, columns, prefixes):

    df = df.copy()

    for column, prefix in zip(columns, prefixes):

        dummies = pd.get_dummies(df[column], prefix)

        df = pd.concat([df, dummies], axis=1)

        df = df.drop(column, axis=1)

    return df
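
# Illustrative sanity check (toy frame, not part of the pipeline):

#   binary_encode(pd.DataFrame({'f': ['Yes', 'No', None]}), ['f'], ['Yes'])['f']

#   -> [1, 0, 0]; anything but the positive value, including NaN, maps to 0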


binary_positive_values = ['Yes' for feature in binary_features]

ordinal_orderings = [

    ['Never', 'Rarely', 'Sometimes', 'Often'],

    ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']]

nominal_prefixes = ['co', 'st', 'be', 'ca', 'we', 'se', 'an', 'le',

                    'mc', 'ph', 'cw', 'su', 'mi', 'pi', 'mp']  # one prefix per nominal feature

# Encoding Operations

data = binary_encode(

    data,

    columns=binary_features,

    positive_values=binary_positive_values)

data = ordinal_encode(

    data,

    columns=ordinal_features,

    orderings=ordinal_orderings)

data = onehot_encode(

    data,

    columns=nominal_features,

    prefixes=nominal_prefixes)


# Encoding 'Labels'

data = binary_encode(data, columns=['treatment'], positive_values=['Yes'])


# SPLITTING/SCALING

y = data['treatment'].copy()

X = data.drop('treatment', axis=1).copy()

scaler = StandardScaler()  # standardize each column to zero mean, unit variance

X = scaler.fit_transform(X)
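
# NOTE: fitting the scaler on all of X before splitting leaks test-set

# statistics into training; a stricter variant (sketch) fits on the training

# split only:

#   X_train = scaler.fit_transform(X_train)

#   X_test = scaler.transform(X_test)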


X_train, X_test, y_train, y_test = train_test_split(  # derive train/test sets

    X, y, train_size=0.7, random_state=100)


# TRAINING

inputs = tf.keras.Input(shape=(X.shape[1],))  # number of features

x = tf.keras.layers.Dense(1024, activation='relu')(inputs)

x = tf.keras.layers.Dense(1024, activation='relu')(x)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)


model = tf.keras.Model(inputs, outputs)

model.compile(

    optimizer='adam',

    loss='binary_crossentropy',

    metrics=[

        'accuracy',

        tf.keras.metrics.AUC(name='auc')])  # class distribution is 49.7% / 50.3%

batch_size = 64

epochs = 50


history = model.fit(

    X_train,

    y_train,

    validation_split=0.2,

    batch_size=batch_size,

    epochs=epochs,

    callbacks=[

        # ReduceLROnPlateau monitors validation loss (defaults: factor=0.1,

        # patience=10) and lowers the learning rate when the loss stops

        # improving, helping the model converge instead of oscillating

        tf.keras.callbacks.ReduceLROnPlateau()])


# PLOTTING

plt.figure(figsize=(12, 6))

plt.plot(range(epochs), history.history['accuracy'], label="Training Accuracy")

plt.plot(range(epochs),  # progression of metric 'accuracy' over each epoch

         history.history['val_accuracy'], label="Validation Accuracy")

plt.xlabel("Epoch")

plt.ylabel("Accuracy")

plt.legend()

plt.title("Accuracy Over Time")

plt.show()
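
The accuracy plotted above covers the training and validation sets only. As a final check, here is a minimal sketch of scoring the held-out 30% test split, assuming the variables from the script above are still in scope (model.evaluate returns the loss followed by the compiled metrics, in order):

test_loss, test_acc, test_auc = model.evaluate(X_test, y_test, verbose=0)

print(f"Test accuracy: {test_acc:.3f}, test AUC: {test_auc:.3f}")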