May Merkle-Tan (MMT) -- Sept 2018
Python rewrite of the general analysis described in http://rpubs.com/hengrumay/Predict_PerformanceMode
- general task and analysis descriptions available in link
• Synopsis :
Wearable devices that monitor physical activity are on the rise and provide a wealth of useful information. Apart from measuring the quantity of activity, assessing the manner in which an activity is performed could also improve remote human activity monitoring. The Weight Lifting Exercise Dataset (Velloso, Bulling, Gellersen, Ugulino, Fuks, 2013) provides a means to derive a “proof-of-concept” for decoding the mode of weight lifting performance. Data were acquired from sensors on the belt, forearm, arm, and dumbbell of 6 participants as they performed barbell lifts correctly and incorrectly in 5 different ways. Further information is available from http://groupware.les.inf.puc-rio.br/har#weight_lifting_exercises
Some differences:
- removed extreme data points (cf. Rcode)
- data-scaling (cf. Rcode)
- includes helper-functions for Plots, ML assessments as separate py-file
This version:
- no explicit grid search or hyperparameter tuning involved...
import os
import subprocess

# Ask once whether the figures produced by this analysis should be written to
# disk; `savefig` is the flag read by the plotting calls further down.
save_fig = {'Yes': True, 'No': False}
Response = input('Save generated figures in this analysis? \n\nRespond with Yes|No ..... : ')

# Normalise the answer so that 'yes', 'YES' or ' no ' are accepted as well.
# An exact-match lookup would raise KeyError on any other spelling, so an
# unrecognised answer falls back to "do not save" with a visible notice.
_answer = Response.strip().capitalize()
if _answer not in save_fig:
    print('\n\n >>> Unrecognised response {!r} -- figures will NOT be saved.'.format(Response))
savefig = save_fig.get(_answer, False)

if savefig:
    print('\n\n >>> Saving figures ......')
    # Portable equivalent of `mkdir -p figFolder` (no external process needed).
    os.makedirs('figFolder', exist_ok=True)
## Display Settings: set the notebook containers to 80% width and centre PNG outputs.
from IPython.display import HTML, display

# Stylesheet injected into the notebook page through an HTML display object.
notebook_css = """
<style>
div#notebook-container { width: 80%; }
div#menubar-container { width: 80%; }
div#maintoolbar-container { width: 80%; }
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
"""
display(HTML(data=notebook_css))
## IMPORT/LOAD libraries and set Settings :
# import subprocess
import os
import glob
## Plotting -------------------------------------------------------------------------------------
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
#from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
sns.set_style(style = 'white')
sns.set_context("notebook",
font_scale=1.125,
rc={"lines.linewidth": 2.5})
## Arrays | ETL | DataFrames -------------------------------------------------------------------
import numpy as np
np.seterr(all='ignore')
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_colwidth',30)
pd.set_option('display.width', 1800)
from datetime import datetime #as dt
from dateutil.parser import parse
# ----------------------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# ML|Stats-related -----------------------------------------------------------------------------
# from scipy import stats
from sklearn import preprocessing # for scaling
#from sklearn.linear_model import LogisticRegression
#from sklearn.neural_network import MLPClassifier
# from sklearn import tree
from sklearn import ensemble
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef
from sklearn import cross_validation
# ### ML modeling -- load helper functions
import MMT_MLStatsPlotFuncs_PredictExCat_v0 as MMTfuncs
# DATA Source | http://groupware.les.inf.puc-rio.br/har#weight_lifting_exercises
# Locations of the training and testing CSV files.
trainURL = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testURL = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Read the training data through the reader provided by the helper module
# (MMTfuncs); the calls below assume it returns a pandas DataFrame.
df_train = MMTfuncs.ReadRAWdataFromSource(trainURL)
# df_train.head()#.T
# Print a per-column summary (dtype, non-null count) of the training data.
df_train.info()
# df_train.dtypes
# Same read-and-summarise steps for the testing data.
df_test = MMTfuncs.ReadRAWdataFromSource(testURL)
# df_test.head()#.T
df_test.info()
## check NaNs: number of missing values per column (training data)
df_train.isna().sum()
## check NaNs: number of missing values per column (testing data)
df_test.isna().sum()
# Number of rows per `classe` value (the exercise category used as target below).
df_train.classe.value_counts()
# Number of rows per `user_name` value.
df_train.user_name.value_counts()
# Number of rows per `new_window` value.
df_train.new_window.value_counts()
from sklearn.preprocessing import LabelEncoder lb_make = LabelEncoder() df_train[["classe_num"]] = df_train.apply(lambda x: lb_make.fit_transform(x)) df_train.head()
# Convert ExerciseCategories into NumericalCategories: each distinct `classe`
# label gets an integer code in order of first appearance -- the dict can be
# used later to find predicted cases.
EXcat = {label: code for code, label in enumerate(df_train.classe.unique())}
def cat2num(x):
    """Return the numeric code stored in the module-level ``EXcat`` for label ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``EXcat``.
    """
    code = EXcat[x]
    return code
# Store the numeric category code of every row in a new `classe_num` column.
df_train['classe_num'] = df_train.classe.apply(cat2num)
# Fraction of missing values per column; keep variables with less than 15% NaNs.
nan_fraction = df_train.isna().sum(axis=0) / len(df_train)
keep_mask = nan_fraction < 0.15
tmpCols = df_train.columns[keep_mask].tolist()

# A better prediction will (ideally) not rely on the user, the time of day, etc.
# -- these other variables are left out of the columns to use:
excluded_cols = (
    'user_name',
    'raw_timestamp_part_1',
    'raw_timestamp_part_2',
    'cvtd_timestamp',
    'new_window',
    'num_window',
)
Cols2use = [col for col in tmpCols if col not in excluded_cols]
# Cols2use
### data Subset with Cols2use -----------------------------------------------------------------------
df_train2 = df_train[Cols2use].copy()
%time df_train2 = MMTfuncs.ExcludeOutliers(Cols2use, df_train, df_train2, sd=4)
print('RAW_TrainData_shape : ', df_train.shape)
print('subset_TrainData_shape : ', df_train2.shape)
print('%_VarColsUsed_from_TrainData :', format(df_train2.shape[1]/df_train.shape[1]*100) +'%')
# ((19622, 160), (19622, 54), #54/160= 0.3375)
## Exclusion of extreme values --> NaNs : Checking overall % of excluded data ----- less than 1% across Vars
def plot_checkNaNs(df_train2):
    """Show a bar chart of the percentage of NaN values in each column.

    Parameters
    ----------
    df_train2 : pandas.DataFrame
        Data whose per-column share of missing values is plotted.
    """
    nan_pct = df_train2.isna().sum() / len(df_train2) * 100
    nan_pct.plot(kind='bar', figsize=(14, 4), color='lightblue')
    plt.title("Percentage of Variable's Values as NaNs \n-- post eliminating extreme values",
              size=16)
    plt.ylabel('%-tage NaNs')
    plt.show()
# The NaN overview is only drawn when figures are not being saved.
if not savefig:
    plot_checkNaNs(df_train2)

# Variable-correlation plot produced by the helper module.
# Doesn't show plot if savefig==True
MMTfuncs.plotVarCorr_heatmap(
    df_train2,
    Cols2use,
    filename='figFolder/TrainData_Variable_correlations.pdf',
    savefig=savefig,
)

# Collect the column names per sensor location; each pattern is handed to the
# helper together with a fresh Cols2use selection, in the same order as before.
# Doesn't show plots if savefig==True
forearmC, armC, beltC, dumbbellC = (
    MMTfuncs.getSimilarColNames(pattern, df_train2[Cols2use])
    for pattern in ("(.*).(_forearm)", "(.*).(_arm)", "(.*).(_belt)", "(.*).(_dumbbell)")
)

## Takes a while to generate the pairplots ....
MMTfuncs.ExploreVariableAssoc_byEXcat(
    df_train2, forearmC, armC, beltC, dumbbellC,
    filename='figFolder/VariableAssociations_',
    savefig=savefig,
)