import pandas as pd
import numpy as np
import warnings
# suppress warnings generated by the libraries used below
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style='white',color_codes=True)
#load training and testing set
train=pd.read_csv('./input/train.csv')
test=pd.read_csv('./input/test.csv')
train.head()
|   | ID | var3 | var15 | imp_ent_var16_ult1 | imp_op_var39_comer_ult1 | imp_op_var39_comer_ult3 | imp_op_var40_comer_ult1 | imp_op_var40_comer_ult3 | imp_op_var40_efect_ult1 | imp_op_var40_efect_ult3 | … | saldo_medio_var33_hace2 | saldo_medio_var33_hace3 | saldo_medio_var33_ult1 | saldo_medio_var33_ult3 | saldo_medio_var44_hace2 | saldo_medio_var44_hace3 | saldo_medio_var44_ult1 | saldo_medio_var44_ult3 | var38 | TARGET |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 39205.170000 | 0 |
| 1 | 3 | 2 | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 49278.030000 | 0 |
| 2 | 4 | 2 | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 67333.770000 | 0 |
| 3 | 8 | 2 | 37 | 0 | 195 | 195 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 64007.970000 | 0 |
| 4 | 10 | 2 | 39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 117310.979016 | 0 |

5 rows × 371 columns
df=pd.DataFrame(train.TARGET.value_counts())
df['Percentage']=100*df.TARGET/train.shape[0]
df
|   | TARGET | Percentage |
|---|--------|------------|
| 0 | 73012  | 96.043147  |
| 1 | 3008   | 3.956853   |
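Only about 4% of customers are unhappy, so any holdout split should preserve that ratio. A minimal sketch of a stratified split (not in the original notebook; assumes sklearn's train_test_split):

from sklearn.model_selection import train_test_split
# stratify on TARGET so both splits keep the ~96/4 class ratio
X_tr, X_val, y_tr, y_val = train_test_split(
    train.drop('TARGET', axis=1), train.TARGET,
    test_size=0.2, stratify=train.TARGET, random_state=0)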
# from the above result, this is an imbalanced dataset
# top ten most common values of var3
train.var3.value_counts()[:10]
2          74165
8            138
-999999      116
9            110
3            108
1            105
13            98
7             97
4             86
12            85
Name: var3, dtype: int64
# var3 is the nationality of the customer
# -999999 means the nationality is unknown
# replace -999999 with the most common value (2)
train=train.replace(-999999,2)
train.loc[train.var3==-999999].shape
(0, 371)
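Note that DataFrame.replace above scans every column, so if -999999 ever appeared outside var3 it would be overwritten as well. A narrower variant one might prefer (hypothetical alternative, same effect here):

# restrict the sentinel replacement to the var3 column only
train['var3'] = train['var3'].replace(-999999, 2)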
# add a feature that counts the number of zeros in each row
X=train.iloc[:,:-1]
y=train.TARGET
X['n0']=(X==0).sum(axis=1)
train['n0']=X['n0']
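Any feature engineered on the training set has to be mirrored on the test set before prediction. A sketch, assuming test has the same columns apart from TARGET:

# mirror the zero-count feature on the test set
test['n0'] = (test == 0).sum(axis=1)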
train.describe()
|   | ID | var3 | var15 | imp_ent_var16_ult1 | imp_op_var39_comer_ult1 | imp_op_var39_comer_ult3 | imp_op_var40_comer_ult1 | imp_op_var40_comer_ult3 | imp_op_var40_efect_ult1 | imp_op_var40_efect_ult3 | … | saldo_medio_var33_hace3 | saldo_medio_var33_ult1 | saldo_medio_var33_ult3 | saldo_medio_var44_hace2 | saldo_medio_var44_hace3 | saldo_medio_var44_ult1 | saldo_medio_var44_ult3 | var38 | TARGET | n0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | … | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 |
| mean | 75964.050723 | 2.716483 | 33.212865 | 86.208265 | 72.363067 | 119.529632 | 3.559130 | 6.472698 | 0.412946 | 0.567352 | … | 1.365146 | 12.215580 | 8.784074 | 31.505324 | 1.858575 | 76.026165 | 56.614351 | 117235.809430 | 0.039569 | 335.426888 |
| std | 43781.947379 | 9.447971 | 12.956486 | 1614.757313 | 339.315831 | 546.266294 | 93.155749 | 153.737066 | 30.604864 | 36.513513 | … | 113.959637 | 783.207399 | 538.439211 | 2013.125393 | 147.786584 | 4040.337842 | 2852.579397 | 182664.598503 | 0.194945 | 17.836658 |
| min | 1.000000 | 0.000000 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | … | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5163.750000 | 0.000000 | 220.000000 |
| 25% | 38104.750000 | 2.000000 | 23.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | … | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 67870.612500 | 0.000000 | 325.000000 |
| 50% | 76043.000000 | 2.000000 | 28.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | … | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 106409.160000 | 0.000000 | 340.000000 |
| 75% | 113748.750000 | 2.000000 | 40.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | … | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 118756.252500 | 0.000000 | 348.000000 |
| max | 151838.000000 | 238.000000 | 105.000000 | 210000.000000 | 12888.030000 | 21024.810000 | 8237.820000 | 11073.570000 | 6600.000000 | 6600.000000 | … | 20385.720000 | 138831.630000 | 91778.730000 | 438329.220000 | 24650.010000 | 681462.900000 | 397884.300000 | 22034738.760000 | 1.000000 | 361.000000 |

8 rows × 372 columns
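The summary suggests that many of the 371 columns are almost entirely zero; this dataset is known to contain constant (and duplicated) columns. A quick hedged check, not performed in the original notebook:

# count columns whose training values never vary; they carry no signal
constant_cols = [c for c in X.columns if X[c].nunique() == 1]
print('{} of {} columns are constant'.format(len(constant_cols), X.shape[1]))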
#num_var4 is the number of products
#plot the number of products
train.num_var4.hist(bins=100)
plt.xlabel('number of products')
plt.ylabel('number of customers')
plt.title('most customers have one product')
plt.show()
# let's look at the density of happy / unhappy customers as a function of the number of bank products
sns.FacetGrid(train,hue='TARGET',size=7).map(plt.hist,'num_var4').add_legend()
plt.title('unhappy customers tend to have fewer bank products')
plt.show()
train[train.TARGET==1].num_var4.hist(bins=6)
plt.title('number of unhappy customers as a function of bank products')
plt.show()
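The histograms are easier to compare as rates than as raw counts. A quick check of the share of unhappy customers per product count (illustrative, not in the original):

# fraction of unhappy customers (TARGET == 1) per number of products
print(train.groupby('num_var4').TARGET.mean())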
# var38 is supposed to be the value of the customer
train.var38.describe()
count       76020.000000
mean       117235.809430
std        182664.598503
min          5163.750000
25%         67870.612500
50%        106409.160000
75%        118756.252500
max      22034738.760000
Name: var38, dtype: float64
# how does var38 look when the customer is unhappy?
train.loc[train.TARGET==1,'var38'].describe()
count       3008.000000
mean       99678.280590
std       106309.811490
min        11136.630000
25%        57160.942500
50%        86219.970000
75%       117310.979016
max      3988595.100000
Name: var38, dtype: float64
# histogram for var38
train.var38.hist(bins=1000)
train.var38.map(np.log).hist(bins=1000)
train.var38.value_counts()
117310.979016    14868
451931.220000       16
463625.160000       12
104563.800000       11
288997.440000       11
236690.340000        8
67088.310000         7
128318.520000        7
329603.970000        7
125722.440000        7
104644.410000        7
70813.800000         6
163432.470000        6
105260.880000        6
97639.560000         6
185385.690000        6
100466.730000        6
168733.620000        6
127141.500000        5
227397.720000        5
71302.530000         5
235476.720000        5
192920.760000        5
33184.020000         5
185784.720000        5
208961.790000        5
83174.280000         5
171932.700000        5
121603.020000        5
229351.650000        5
…
67239.600000         1
84077.580000         1
83315.520000         1
84145.410000         1
84203.250000         1
111706.230000        1
84202.590000         1
215271.630000        1
84198.450000         1
84196.500000         1
84194.940000         1
83321.910000         1
117971.910000        1
84190.050000         1
84187.410000         1
84185.040000         1
84182.670000         1
84181.950000         1
215252.280000        1
84179.850000         1
84178.770000         1
84177.090000         1
477388.740000        1
84169.110000         1
84167.880000         1
84162.270000         1
84160.980000         1
215230.230000        1
84150.720000         1
131072.070000        1
Name: var38, dtype: int64
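Rather than hard-coding 117310.979016 below, the most common value could be derived from the data. A small sketch:

# most frequent var38 value, derived instead of typed in by hand
var38_mode = train.var38.value_counts().idxmax()
print(var38_mode)  # 117310.979016 on this training set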
# what if we exclude the most common value?
train.loc[~np.isclose(train.var38,117310.979016),'var38'].value_counts()
451931.22    16
463625.16    12
104563.80    11
288997.44    11
236690.34     8
128318.52     7
104644.41     7
125722.44     7
329603.97     7
67088.31      7
70813.80      6
185385.69     6
100466.73     6
168733.62     6
163432.47     6
105260.88     6
97639.56      6
171932.70     5
148781.16     5
131353.47     5
185784.72     5
53324.46      5
127141.50     5
63820.89      5
121603.02     5
235476.72     5
83174.28      5
85814.04      5
276030.57     5
71302.53      5
..
84483.51      1
84482.01      1
477748.14     1
84535.56      1
84577.77      1
84536.04      1
84574.05      1
215645.88     1
84571.65      1
84570.66      1
84569.88      1
84567.84      1
84565.59      1
84563.91      1
84562.95      1
84560.67      1
84559.17      1
84558.96      1
84557.91      1
84556.50      1
84555.27      1
84545.04      1
215616.45     1
65688.57      1
84541.53      1
84540.33      1
84539.79      1
84538.08      1
84537.51      1
131072.07     1
Name: var38, dtype: int64
# exclude the most common value and look at its distribution
train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log).hist(bins=100)
# the above results suggest splitting var38 into two variables
# var38mc = 1 when var38 takes its most common value, otherwise var38mc = 0
train['var38mc']=np.isclose(train.var38,117310.979016)
# logvar38 is log(var38) when var38mc is False, otherwise it is zero
train['logvar38']=train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log)
train.loc[train['var38mc'],'logvar38']=0
#check for nan
print("the number of nan in var38mc is",train.var38mc.isnull().sum())
print('the number of nan in logvar38 is ',train.logvar38.isnull().sum())
the number of nan in var38mc is 0
the number of nan in logvar38 is 0
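As with n0, the var38 split has to be applied identically to the test set, reusing the value found on train. A hedged sketch:

# reuse the most common value from TRAIN; never re-derive it on test
test['var38mc'] = np.isclose(test.var38, 117310.979016)
test['logvar38'] = test.loc[~test['var38mc'], 'var38'].map(np.log)
test.loc[test['var38mc'], 'logvar38'] = 0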
# var15 is the age of the customer
train.var15.describe()
count    76020.000000
mean        33.212865
std         12.956486
min          5.000000
25%         23.000000
50%         28.000000
75%         40.000000
max        105.000000
Name: var15, dtype: float64
train.var15.hist(bins=100)
sns.FacetGrid(train,hue='TARGET',size=6).map(sns.kdeplot,'var15').add_legend()
plt.title('unhappy customers are slightly older')
train.saldo_var30.hist(bins=100)
plt.xlim(0,train.saldo_var30.max())
(0, 3458077.3199999998)
# improve the plot by making the x axis logarithmic
#train['log_saldo_var30'] = train.saldo_var30.map(np.log)
sns.FacetGrid(train, hue="TARGET", size=20) \
.map(sns.kdeplot, "saldo_var30") \
.add_legend();
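The commented-out log transform would produce -inf for zero balances (and NaN for negative ones). One hedged workaround is to shift the balance before taking the log, assuming saldo_var30 may contain zeros or negatives:

# shift so the minimum balance maps to log(1) = 0, keeping all values finite
shift = 1 - train.saldo_var30.min()
train['log_saldo_var30'] = np.log(train.saldo_var30 + shift)
sns.FacetGrid(train, hue='TARGET', size=6) \
   .map(sns.kdeplot, 'log_saldo_var30') \
   .add_legend()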
#explore the interaction of var15 and var38
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.scatter,'var38','var15').add_legend()
# Exclude most common value for var38
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
.map(plt.scatter, "logvar38", "var15") \
.add_legend()
plt.ylim([0,120]);
sns.FacetGrid(train, hue="TARGET", size=10) \
.map(plt.scatter, "logvar38", "var15") \
.add_legend()
plt.ylim([0,120]); # Age must be positive ;-)
# What is the distribution of the age when var38 has its most common value?
sns.FacetGrid(train[train.var38mc], hue="TARGET", size=6) \
.map(sns.kdeplot, "var15") \
.add_legend();
# What is density of n0 ?
sns.FacetGrid(train, hue="TARGET", size=6) \
.map(sns.kdeplot, "n0") \
.add_legend()
plt.title('Unhappy customers have a lot of features that are zero');
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale
# First select features based on chi2 and f_classif
p = 3
X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)
chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [ f for f,s in zip(X.columns, selected) if s]
print (features)
Chi2 selected 12 features ['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var5', 'num_var8_0', 'num_var30_0', 'num_var30', 'num_var42', 'saldo_var30', 'var36', 'num_meses_var5_ult3'].
F_classif selected 12 features ['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var4', 'num_var5', 'num_var30', 'num_var35', 'num_var42', 'var36', 'num_meses_var5_ult3', 'n0'].
Chi2 & F_classif selected 9 features
['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var5', 'num_var30', 'num_var42', 'var36', 'num_meses_var5_ult3']
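Note that the selectors above are fit on all of train, so any score computed on the same data is optimistic. A hedged sketch of re-fitting the selection inside each cross-validation fold (assumes sklearn's Pipeline; not part of the original kernel):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
# selection is re-fit on each training fold, so held-out data never leaks into it
pipe = Pipeline([
    ('select', SelectPercentile(f_classif, percentile=p)),
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
print(cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc').mean())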
X_sel=train[features+['TARGET']]
X_sel.info()
X_sel.describe()
|   | var15 | ind_var5 | ind_var8_0 | ind_var30 | num_var5 | num_var30 | num_var42 | var36 | num_meses_var5_ult3 | TARGET |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 | 76020.000000 |
| mean | 33.212865 | 0.663760 | 0.032833 | 0.732833 | 1.999171 | 2.382873 | 2.217995 | 40.449079 | 1.979979 | 0.039569 |
| std | 12.956486 | 0.472425 | 0.178202 | 0.442483 | 1.431902 | 1.642787 | 1.497703 | 47.362719 | 1.298924 | 0.194945 |
| min | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 23.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 |
| 50% | 28.000000 | 1.000000 | 0.000000 | 1.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 0.000000 |
| 75% | 40.000000 | 1.000000 | 0.000000 | 1.000000 | 3.000000 | 3.000000 | 3.000000 | 99.000000 | 3.000000 | 0.000000 |
| max | 105.000000 | 1.000000 | 1.000000 | 1.000000 | 15.000000 | 33.000000 | 18.000000 | 99.000000 | 3.000000 | 1.000000 |
sns.FacetGrid(X_sel,hue='TARGET',size=6).map(sns.kdeplot,'var36').add_legend()
plt.title('the share of unhappy customers is smaller when var36 is not 99')
X_sel.var36.value_counts()
99 30064
3 22177
1 14664
2 8704
0 411
Name: var36, dtype: int64
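Those counts are clearer as per-value unhappy rates; a quick illustrative check:

# share of unhappy customers for each var36 value
print(pd.crosstab(train.var36, train.TARGET, normalize='index'))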
# var36 in function of var38 (most common value excluded)
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
.map(plt.scatter, "var36", "logvar38") \
.add_legend();
sns.FacetGrid(train[(~train.var38mc)&(train.var36<4)],hue='TARGET',size=10).map(plt.scatter,'var36','logvar38').add_legend()
plt.title('when var36 is zero, the customers are all unhappy')
#look at the value of var38 when var36==99
sns.FacetGrid(train[(~train.var38mc)&(train.var36==99)], hue="TARGET", size=10) \
.map(sns.kdeplot, 'logvar38') \
.add_legend();
train.num_var5.value_counts()
3 50265
0 25561
6 190
9 3
15 1
Name: num_var5, dtype: int64
train[train.TARGET==0].num_var5.value_counts()
3 49223
0 23602
6 183
9 3
15 1
Name: num_var5, dtype: int64
train[train.TARGET==1].num_var5.value_counts()
0 1959
3 1042
6 7
Name: num_var5, dtype: int64
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.hist,'num_var5').add_legend()
sns.FacetGrid(train, hue="TARGET", size=6) \
.map(sns.kdeplot, "num_var5") \
.add_legend();
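To close the exploration, a baseline model over the jointly selected features plus the engineered ones gives a reference AUC. A minimal sketch, assuming the columns created above (n0, var38mc, logvar38) and a random forest as a placeholder classifier:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
# stratified CV keeps the ~96/4 class ratio in every fold
cols = features + ['n0', 'var38mc', 'logvar38']
rf = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(rf, train[cols], train.TARGET, cv=cv, scoring='roc_auc')
print('baseline CV AUC: {:.3f}'.format(scores.mean()))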