Code
#!python3 -m pip install ydata-profiling

Tony Duan
October 15, 2023

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
#%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from pandas import get_dummies

from https://www.kaggle.com/c/titanic/data
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
get_dummies()

| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
for model x:
for final x:
67.79
83.65
98.25
| Survived | correct | |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 1 | 1 |
| 4 | 0 | 0 |
guess all are 0
# one way create 'compare' using mutate from siuba
base_data=(y_train.to_frame()).reset_index(drop=True)>>mutate(pred=0)>>mutate(compare=if_else(_.Survived==_.pred,1,0))
# or create 'compare' using np.where()
base_data=y_train.to_frame()>>mutate(pred=0)
base_data['compare'] = np.where(base_data['Survived']!= base_data['pred'], 0, 1)

the dummy acc is 60%
using Random forest:
| PassengerId | Survived | |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 0 |
| 2 | 894 | 1 |
| 3 | 895 | 1 |
| 4 | 896 | 0 |
https://www.kaggle.com/c/titanic/data
https://kevinwang09.github.io/compare-r-python/workflows.html
https://www.kaggle.com/code/startupsci/titanic-data-science-solutions
---
title: "Machine learning 1 in python with sklearn"
author: "Tony Duan"
date: "2023-10-15"
categories: [Python]
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: left
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
{width="400"}
# package
```{python}
#!python3 -m pip install ydata-profiling
```
```{python}
#!python3 -m pip install pydantic-settings
```
```{python}
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from siuba import _, mutate, filter, group_by, summarize, show_query
from siuba import *
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# pandas-profiling was renamed to ydata-profiling (the package installed
# above); prefer the new module name and fall back for older installs.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport
#%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from pandas import get_dummies
```
# data
```{python}
# Show the files available in the local data/ directory.
import os
os.listdir('data')
```
from https://www.kaggle.com/c/titanic/data
```{python}
# Read the Kaggle Titanic splits from the local data/ folder.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
# Keep both frames in one list so shared transforms can loop over them.
combine = [train_df, test_df]
```
# EDA
```{python}
# All column names in the training set.
print(train_df.columns.values)
```
```{python}
# Preview the first rows of the training data.
train_df.head()
```
```{python}
# Dtypes and non-null counts for the train split (Age/Cabin/Embarked have gaps).
train_df.info()
```
```{python}
# Dtypes and non-null counts for the test split (Age/Fare/Cabin have gaps).
test_df.info()
```
```{python}
# Summary statistics for the numeric training columns.
train_df.describe()
```
```{python}
# Build the profiling report for the training data.
# The original used an f-string with no placeholders; a plain literal suffices.
profile_report = ProfileReport(train_df, title="Pandas Profiling Report for Titanic Dataset")
```
```{python}
# Write the profiling report to a standalone HTML file.
profile_report.to_file("profile.html")
```
```{python}
# Render the profiling report inline in the notebook output.
profile_report.to_notebook_iframe()
```
# data clean
## target variable:
```{python}
# Target label column (0/1 values) for the supervised models.
y = train_df["Survived"]
```
## feature engineering on model variable:
### one hot encode with `get_dummies()`
```{python}
# One-hot encode the low-cardinality categorical/ordinal features.
one_hot_features = ["Pclass", "Sex", "SibSp", "Parch"]
train_df_encoded = pd.get_dummies(train_df[one_hot_features])
test_df_encoded = pd.get_dummies(test_df[one_hot_features])
```
### handling missing
```{python}
# Summary stats again — the Age count (714 of 891) reveals missing values.
train_df.describe()
```
```{python}
# Numeric columns with missing values (Age in both splits, Fare in test).
missing_features = ["Age", "Fare"]
# Impute BOTH splits with the training means: the original filled the test
# set with its own means, which leaks test-set information into the features.
feature_means = train_df[missing_features].mean()
train_df_missing = train_df[missing_features].fillna(feature_means)
test_df_missing = test_df[missing_features].fillna(feature_means)
```
### combine
for model x:
```{python}
# Column-wise concat: encoded categoricals + imputed numerics → model features.
x = pd.concat([train_df_encoded, train_df_missing], axis=1, join='inner')
```
for final x:
```{python}
# Same column-wise concat for the unlabeled (submission) features.
final_x = pd.concat([test_df_encoded, test_df_missing], axis=1, join='inner')
```
## split
```{python}
# 90/10 split of the labelled data; fixed random_state makes it reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.9, random_state = 123)
```
```{python}
# (rows, columns) of the training split.
x_train.shape
```
```{python}
# (rows, columns) of the held-out split.
x_test.shape
```
```{python}
# Peek at the first training feature rows.
x_train.head()
```
# model
## SVM
```{python}
# Fit a support-vector classifier on the training split.
svc = SVC().fit(x_train, y_train)
# Training-set accuracy, as a percentage rounded to two decimals.
svc_train_score = svc.score(x_train, y_train)
acc_svc_train = round(svc_train_score * 100, 2)
print(acc_svc_train)
```
```{python}
# Held-out accuracy for the SVC, then predictions on the submission features.
svc_test_score = svc.score(x_test, y_test)
acc_svc_test = round(svc_test_score * 100, 2)
print(acc_svc_test)
svc_pred = svc.predict(final_x)
```
## KNN
```{python}
# 3-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
# Training-set accuracy as a rounded percentage.
acc_knn_train = round(100 * knn.score(x_train, y_train), 2)
print(acc_knn_train)
```
```{python}
# Held-out accuracy for KNN, then predictions on the submission features.
knn_test_score = knn.score(x_test, y_test)
acc_knn_test = round(knn_test_score * 100, 2)
print(acc_knn_test)
knn_pred = knn.predict(final_x)
```
## Random Forest
```{python}
# Random forest with 100 trees (no random_state set, so scores vary per run).
random_forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
# Training-set accuracy as a rounded percentage.
acc_random_forest_train = round(100 * random_forest.score(x_train, y_train), 2)
print(acc_random_forest_train)
```
```{python}
# Held-out accuracy, then submission predictions from the random forest.
rf_test_score = random_forest.score(x_test, y_test)
acc_random_forest_test = round(rf_test_score * 100, 2)
print(acc_random_forest_test)
Y_pred = random_forest.predict(final_x)
```
# Benchmark
```{python}
# First few training labels (index keeps the original row positions).
y_train.head()
```
```{python}
# Take rows 10..14 of the training labels as a one-column DataFrame.
y_train_data = y_train.to_frame().iloc[10:15]
```
```{python}
# Display the slice; the original index values are still attached.
y_train_data
```
```{python}
# Renumber the rows 0..n-1 by dropping the original index.
y_train_data=y_train_data.reset_index(drop=True)
y_train_data
```
```{python}
# siuba pipe: add a 'correct' column set to 1 where Survived > 0, else 0.
y_train_data2=y_train_data>>mutate(correct=if_else(_.Survived>0,1,0))
y_train_data2
```
```{python}
# Frequency of each label value in the training split.
from collections import Counter
Counter(y_train)
```
guess all are 0
```{python}
# one way create 'compare' using mutate from siuba
base_data=(y_train.to_frame()).reset_index(drop=True)>>mutate(pred=0)>>mutate(compare=if_else(_.Survived==_.pred,1,0))
# or create 'compare' using np.where()
# (this second version overwrites base_data; note it skips reset_index)
base_data=y_train.to_frame()>>mutate(pred=0)
base_data['compare'] = np.where(base_data['Survived']!= base_data['pred'], 0, 1)
```
the dummy acc is 60%
```{python}
# Baseline accuracy: fraction of rows where the all-zeros guess is correct.
# The original called count(base_data), which is undefined in plain Python
# (NameError); len() gives the row count.
sum(base_data['compare'])/len(base_data)
```
## final prediction
using Random forest:
```{python}
# Kaggle submission frame: one predicted Survived label per test passenger.
submission = pd.DataFrame(
    {
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred,
    }
)
submission.head()
```
```{python}
# Persist the submission without writing the DataFrame index column.
submission.to_csv('submission.csv',index=False)
```
# Reference
https://www.kaggle.com/c/titanic/data
https://kevinwang09.github.io/compare-r-python/workflows.html
https://www.kaggle.com/code/startupsci/titanic-data-science-solutions