Code
#!python3 -m pip install ydata-profiling

Tony Duan
October 15, 2023

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
#%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from pandas import get_dummies

from https://www.kaggle.com/c/titanic/data
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
get_dummies()

| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
for model x:
for final x:
67.79
83.65
98.25
| Survived | correct | |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 1 | 1 |
| 4 | 0 | 0 |
guess all are 0
# one way create 'compare' using mutate from siuba
base_data=(y_train.to_frame()).reset_index(drop=True)>>mutate(pred=0)>>mutate(compare=if_else(_.Survived==_.pred,1,0))
# or create 'compare' using np.where()
base_data=y_train.to_frame()>>mutate(pred=0)
base_data['compare'] = np.where(base_data['Survived']!= base_data['pred'], 0, 1)

the dummy acc is 60%
using Random forest:
| PassengerId | Survived | |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 0 |
| 2 | 894 | 1 |
| 3 | 895 | 1 |
| 4 | 896 | 0 |
https://www.kaggle.com/c/titanic/data
https://kevinwang09.github.io/compare-r-python/workflows.html
https://www.kaggle.com/code/startupsci/titanic-data-science-solutions
---
title: "Machine learning 1 in python with sklearn"
author: "Tony Duan"
date: "2023-10-15"
categories: [Python]
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: left
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
{width="400"}
# package
```{python}
#!python3 -m pip install ydata-profiling
```
```{python}
#!python3 -m pip install pydantic-settings
```
```{python}
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from siuba import _, mutate, filter, group_by, summarize, show_query
from siuba import *
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# pandas-profiling was renamed to ydata-profiling (the package installed
# above); prefer the new module name and fall back for older installs.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport
#%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from pandas import get_dummies
```
# data
```{python}
# Show the files available in the local data/ directory.
import os
os.listdir('data')
```
from https://www.kaggle.com/c/titanic/data
```{python}
# Read the Kaggle Titanic splits from the local data/ folder.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
# Keep both frames in one list so shared transforms can loop over them.
combine = [train_df, test_df]
```
# EDA
```{python}
# All column names in the training set.
print(train_df.columns.values)
```
```{python}
# Preview the first rows of the training data.
train_df.head()
```
```{python}
# Dtypes and non-null counts for the train split (Age/Cabin/Embarked have gaps).
train_df.info()
```
```{python}
# Dtypes and non-null counts for the test split (Age/Fare/Cabin have gaps).
test_df.info()
```
```{python}
# Summary statistics for the numeric training columns.
train_df.describe()
```
```{python}
# Build the profiling report for the training data.
# The original used an f-string with no placeholders; a plain literal suffices.
profile_report = ProfileReport(train_df, title="Pandas Profiling Report for Titanic Dataset")
```
```{python}
# Write the profiling report to a standalone HTML file.
profile_report.to_file("profile.html")
```
```{python}
# Render the profiling report inline in the notebook output.
profile_report.to_notebook_iframe()
```
# data clean
## target variable:
```{python}
# Target label column (0/1 values) for the supervised models.
y = train_df["Survived"]
```
## feature engineering on model variable:
### one hot encode with `get_dummies()`
```{python}
# One-hot encode the low-cardinality categorical/ordinal features.
one_hot_features = ["Pclass", "Sex", "SibSp", "Parch"]
train_df_encoded = pd.get_dummies(train_df[one_hot_features])
test_df_encoded = pd.get_dummies(test_df[one_hot_features])
```
### handling missing
```{python}
# Summary stats again — the Age count (714 of 891) reveals missing values.
train_df.describe()
```
```{python}
# Numeric columns with missing values (Age in both splits, Fare in test).
missing_features = ["Age", "Fare"]
# Impute BOTH splits with the training means: the original filled the test
# set with its own means, which leaks test-set information into the features.
feature_means = train_df[missing_features].mean()
train_df_missing = train_df[missing_features].fillna(feature_means)
test_df_missing = test_df[missing_features].fillna(feature_means)
```
### combine
for model x:
```{python}
# Column-wise concat: encoded categoricals + imputed numerics → model features.
x = pd.concat([train_df_encoded, train_df_missing], axis=1, join='inner')
```
for final x:
```{python}
# Same column-wise concat for the unlabeled (submission) features.
final_x = pd.concat([test_df_encoded, test_df_missing], axis=1, join='inner')
```
## split
```{python}
# 90/10 split of the labelled data; fixed random_state makes it reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.9, random_state = 123)
```
```{python}
# (rows, columns) of the training split.
x_train.shape
```
```{python}
# (rows, columns) of the held-out split.
x_test.shape
```
```{python}
# Peek at the first training feature rows.
x_train.head()
```
# model
## SVM
```{python}
# Fit a support-vector classifier on the training split.
svc = SVC().fit(x_train, y_train)
# Training-set accuracy, as a percentage rounded to two decimals.
svc_train_score = svc.score(x_train, y_train)
acc_svc_train = round(svc_train_score * 100, 2)
print(acc_svc_train)
```
```{python}
# Held-out accuracy for the SVC, then predictions on the submission features.
svc_test_score = svc.score(x_test, y_test)
acc_svc_test = round(svc_test_score * 100, 2)
print(acc_svc_test)
svc_pred = svc.predict(final_x)
```
## KNN
```{python}
# 3-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
# Training-set accuracy as a rounded percentage.
acc_knn_train = round(100 * knn.score(x_train, y_train), 2)
print(acc_knn_train)
```
```{python}
# Held-out accuracy for KNN, then predictions on the submission features.
knn_test_score = knn.score(x_test, y_test)
acc_knn_test = round(knn_test_score * 100, 2)
print(acc_knn_test)
knn_pred = knn.predict(final_x)
```
## Random Forest
```{python}
# Random forest with 100 trees (no random_state set, so scores vary per run).
random_forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
# Training-set accuracy as a rounded percentage.
acc_random_forest_train = round(100 * random_forest.score(x_train, y_train), 2)
print(acc_random_forest_train)
```
```{python}
# Held-out accuracy, then submission predictions from the random forest.
rf_test_score = random_forest.score(x_test, y_test)
acc_random_forest_test = round(rf_test_score * 100, 2)
print(acc_random_forest_test)
Y_pred = random_forest.predict(final_x)
```
# Benchmark
```{python}
# First few training labels (index keeps the original row positions).
y_train.head()
```
```{python}
# Take rows 10..14 of the training labels as a one-column DataFrame.
y_train_data = y_train.to_frame().iloc[10:15]
```
```{python}
# Display the slice; the original index values are still attached.
y_train_data
```
```{python}
# Renumber the rows 0..n-1 by dropping the original index.
y_train_data=y_train_data.reset_index(drop=True)
y_train_data
```
```{python}
# siuba pipe: add a 'correct' column set to 1 where Survived > 0, else 0.
y_train_data2=y_train_data>>mutate(correct=if_else(_.Survived>0,1,0))
y_train_data2
```
```{python}
# Frequency of each label value in the training split.
from collections import Counter
Counter(y_train)
```
guess all are 0
```{python}
# one way create 'compare' using mutate from siuba
base_data=(y_train.to_frame()).reset_index(drop=True)>>mutate(pred=0)>>mutate(compare=if_else(_.Survived==_.pred,1,0))
# or create 'compare' using np.where()
# (this second version overwrites base_data; note it skips reset_index)
base_data=y_train.to_frame()>>mutate(pred=0)
base_data['compare'] = np.where(base_data['Survived']!= base_data['pred'], 0, 1)
```
the dummy acc is 60%
```{python}
# Baseline accuracy: fraction of rows where the all-zeros guess is correct.
# The original called count(base_data), which is undefined in plain Python
# (NameError); len() gives the row count.
sum(base_data['compare'])/len(base_data)
```
## final prediction
using Random forest:
```{python}
# Kaggle submission frame: one predicted Survived label per test passenger.
submission = pd.DataFrame(
    {
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred,
    }
)
submission.head()
```
```{python}
# Persist the submission without writing the DataFrame index column.
submission.to_csv('submission.csv',index=False)
```
# Reference
https://www.kaggle.com/c/titanic/data
https://kevinwang09.github.io/compare-r-python/workflows.html
https://www.kaggle.com/code/startupsci/titanic-data-science-solutions