Mental Health Treatment Prediction Language Optimization
Lately I've taken an interest in modern psychiatric practice as one avenue toward better mental health outcomes. Beyond novel clinical methods, I see real potential in combining language processing with data analysis. For this project I worked with the OSMI Mental Health in Tech survey dataset and built a model to predict whether respondents sought treatment. Using a TensorFlow ANN classifier on the processed survey data, accuracy hovered around 67 to 69%. However, this model ignores the natural-language data in the 'comments' column, and the modest accuracy supports the proposition that the comment text carries additional signal about why respondents sought treatment, which I explore next.
import numpy as np # 1.24.1
import pandas as pd # 1.5.2
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf # 2.11.0
data = pd.read_csv('survey.csv')
# CLEANING
langdata = data.loc[:, "comments"] # save comments to var langdata
langdata = langdata.dropna()
data = data.drop('comments', axis=1)
# data['self_employed'].unique() -> unique is NaN, Yes, No
# data['self_employed'].mode() -> 'No' is most common
data['self_employed'] = data['self_employed'].fillna('No')
data['work_interfere'] = data['work_interfere'].fillna('Sometimes')
# FEATURE ENGINEERING
data['Year'] = data['Timestamp'].apply(lambda x: np.int64(x[0:4]))
data['Month'] = data['Timestamp'].apply(lambda x: np.int64(x[5:7]))
data['Day'] = data['Timestamp'].apply(lambda x: np.int64(x[8:10]))
data['Hour'] = data['Timestamp'].apply(lambda x: np.int64(x[11:13]))
data['Minute'] = data['Timestamp'].apply(lambda x: np.int64(x[14:16]))
data['Second'] = data['Timestamp'].apply(lambda x: np.int64(x[17:19]))
data = data.drop('Timestamp', axis=1)
# ENCODING FEATURES
{column: len(data[column].unique())
 for column in data.select_dtypes('object').columns} # exploratory: count of unique values per object column
{column: list(data[column].unique())
 for column in data.select_dtypes('object').columns} # exploratory: the unique values themselves
def encode_gender(x): # 'Gender' was free-text entry; bucket into female=0, male=1, other=2
if x.lower()[0] == 'f': return 0
elif x.lower()[0] == 'm': return 1
else: return 2
data['Gender'] = data['Gender'].apply(encode_gender)
target = 'treatment'
binary_features = [
'self_employed',
'family_history',
'remote_work',
'tech_company',
'obs_consequence']
ordinal_features = [
'work_interfere',
'no_employees']
nominal_features = [
'Country',
'state',
'benefits',
'care_options',
'wellness_program',
'seek_help',
'anonymity',
'leave',
'mental_health_consequence',
'phys_health_consequence',
'coworkers',
'supervisor',
'mental_health_interview',
'phys_health_interview',
'mental_vs_physical', ]
def binary_encode(df, columns, positive_values): # map Yes/No-style columns to 1/0
df = df.copy()
for column, positive_value in zip(columns, positive_values):
df[column] = df[column].apply(
lambda x: 1 if x == positive_value else 0)
return df
def ordinal_encode(df, columns, orderings): # map ordered categories to their index in the given ordering
df = df.copy()
for column, ordering in zip(columns, orderings):
df[column] = df[column].apply(lambda x: ordering.index(x))
return df
def onehot_encode(df, columns, prefixes): # expand nominal columns into prefixed dummy columns
df = df.copy()
for column, prefix in zip(columns, prefixes):
dummies = pd.get_dummies(df[column], prefix)
df = pd.concat([df, dummies], axis=1)
df = df.drop(column, axis=1)
return df
binary_positive_values = ['Yes' for feature in binary_features]
ordinal_orderings = [
['Never', 'Rarely', 'Sometimes', 'Often'],
['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']]
nominal_prefixes = ['co', 'st', 'be', 'ca', 'we', 'se', 'an', 'le',
    'mc', 'ph', 'cw', 'su', 'mi', 'pi', 'mp'] # one short prefix per nominal feature, in order
# Encoding Operations
data = binary_encode(
data,
columns=binary_features,
positive_values=binary_positive_values)
data = ordinal_encode(
    data,
    columns=ordinal_features,
    orderings=ordinal_orderings)
data = onehot_encode(
data,
columns=nominal_features,
prefixes=nominal_prefixes)
# Encoding 'Labels'
data = binary_encode(data, columns=['treatment'], positive_values=['Yes'])
# SPLITTING/SCALING
y = data['treatment'].copy()
X = data.drop('treatment', axis=1).copy()
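# Quick sanity check (a sketch, not part of the original run): confirm the class
# split cited alongside the AUC metric below, roughly 49.7% / 50.3%
print(y.value_counts(normalize=True))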
scaler = StandardScaler() # standardize each column to zero mean and unit variance
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split( # derive train/test set
X, y, train_size=0.7, random_state=100)
# TRAINING
inputs = tf.keras.Input(shape=(X.shape[1],)) # number of features
x = tf.keras.layers.Dense(1024, activation='relu')(inputs)
x = tf.keras.layers.Dense(1024, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs, outputs)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')]) # class distribution is 49.7% / 50.3%
batch_size = 64
epochs = 50
history = model.fit(
X_train,
y_train,
validation_split=0.2,
batch_size=batch_size,
epochs=epochs,
    callbacks=[
        # ReduceLROnPlateau watches the validation loss across epochs and
        # lowers the learning rate when it stops improving, giving the model
        # a chance to converge rather than overshoot once it nears a plateau
        tf.keras.callbacks.ReduceLROnPlateau()])
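# Hold-out evaluation (a sketch, not part of the original listing): a check
# like this backs the 67 to 69% accuracy reported above; exact numbers will
# vary with the split and random initialization
test_loss, test_acc, test_auc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_acc:.3f}, test AUC: {test_auc:.3f}")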
# PLOTTING
plt.figure(figsize=(12, 6))
plt.plot(range(epochs), history.history['accuracy'], label="Training Accuracy")
plt.plot(range(epochs), # validation accuracy over each epoch, alongside training accuracy
    history.history['val_accuracy'], label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Accuracy Over Time")
plt.show()
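As a first pass at that exploration, here is a minimal sketch that pairs each comment kept in langdata with that respondent's treatment label and scores a text-only baseline; the TF-IDF settings and the logistic-regression classifier are placeholder assumptions rather than the final approach.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# langdata kept the original row index, so .loc aligns each comment with that
# respondent's (already binary-encoded) treatment label
text_labels = data.loc[langdata.index, 'treatment']
# bag-of-words TF-IDF features over the comments (settings are placeholders)
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X_text = vectorizer.fit_transform(langdata)
# cross-validated accuracy of a comments-only baseline, to gauge whether the
# free text carries any treatment signal on its own
baseline = LogisticRegression(max_iter=1000)
scores = cross_val_score(baseline, X_text, text_labels, cv=5)
print(f"Comments-only baseline accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")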