Day 22 Python学习打卡记录
复习日
仔细回顾一下之前21天的内容,没跟上进度的同学补一下进度。
作业:
自行学习如何使用 Kaggle 平台,写下使用注意点,并为下述比赛提交代码:
Kaggle 泰坦尼克号人员生还预测:https://www.kaggle.com/competitions/titanic/overview
import numpy as np
import pandas as pd
# Display settings: print every column and keep wide frames on one row
# instead of wrapping them across several blocks.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Load the Titanic training data and preview its first rows.
df = pd.read_csv('./train.csv')
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of the number of passengers per Survived value.
survival_ax = sns.countplot(data=df, x='Survived')
survival_ax.set_title('Survival Count')
survival_ax.set_xlabel('Survived (0 = No, 1 = Yes)')
survival_ax.set_ylabel('Count')
plt.show()

# Survival counts split by sex.
# The hue values are renamed before plotting and the order is pinned with
# hue_order, so seaborn builds the legend (title 'Sex') itself. Passing only
# `labels=` to plt.legend pairs labels with artists purely by position, which
# can attach the wrong label or colour to a legend entry.
sex_display = {'male': 'Male', 'female': 'Female'}
sns.countplot(
    x='Survived',
    hue='Sex',
    hue_order=list(sex_display.values()),
    data=df.assign(Sex=df['Sex'].map(sex_display)),
)
plt.title('Survival by Gender')
plt.xlabel('Survived (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

# Survival counts split by passenger class.
# The class codes are replaced by readable names in a temporary copy and the
# order is fixed with hue_order, so the automatically generated legend is
# guaranteed to match the bars (a labels-only plt.legend call matches labels
# to artists by position and can mislabel them).
pclass_display = {1: '1st Class', 2: '2nd Class', 3: '3rd Class'}
sns.countplot(
    x='Survived',
    hue='Pclass',
    hue_order=list(pclass_display.values()),
    data=df.assign(Pclass=df['Pclass'].map(pclass_display)),
)
plt.title('Survival by Passenger Class')
plt.xlabel('Survived (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns
# Histogram of passenger ages (30 bins) with a KDE curve overlaid.
age_ax = sns.histplot(x=df['Age'], bins=30, kde=True)
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

# Age density for survivors versus non-survivors.
# Survived is mapped to 'No'/'Yes' in a temporary copy so that seaborn creates
# the legend from the data. Overriding the legend with a bare list of labels
# relies on the order in which the density artists were added to the axes and
# can swap the two labels.
survival_display = {0: 'No', 1: 'Yes'}
sns.kdeplot(
    data=df.assign(Survived=df['Survived'].map(survival_display)),
    x='Age',
    hue='Survived',
    hue_order=list(survival_display.values()),
    fill=True,
)
plt.title('Survival by Age')
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()

# Histogram of ticket fares (30 bins) with a KDE curve overlaid.
fare_ax = sns.histplot(x=df['Fare'], bins=30, kde=True)
fare_ax.set_title('Fare Distribution')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('Frequency')
plt.show()

# Survival counts split by port of embarkation.
# The port codes are translated to full names before plotting and the order is
# pinned with hue_order. The previous hard-coded legend labels were listed in
# C, Q, S order, but seaborn orders string hue levels by first appearance in
# the data, so the labels did not line up with the bars they described.
port_display = {
    'C': 'Cherbourg (C)',
    'Q': 'Queenstown (Q)',
    'S': 'Southampton (S)',
}
sns.countplot(
    x='Survived',
    hue='Embarked',
    hue_order=list(port_display.values()),
    data=df.assign(Embarked=df['Embarked'].map(port_display)),
)
plt.title('Survival by Embarkation Port')
plt.xlabel('Survived (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap.
numeric_features = df.select_dtypes(include=['number'])
correlation_matrix = numeric_features.corr()
heatmap_ax = sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S
# Extract the honorific from Name: the run of letters that follows a space and
# ends with a period, e.g. "Mr" in "Braund, Mr. Owen Harris".
# The pattern is a raw string so that `\.` reaches the regex engine unchanged;
# in a normal string literal it is an invalid escape sequence, which newer
# Python versions warn about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent honorifics into a single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')

# Fold alternative spellings into their common form.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Encode the remaining titles as integers; a title missing from the mapping
# becomes NaN.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
df['Title'] = df['Title'].map(title_mapping)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S
# FamilySize = siblings/spouses + parents/children + the passenger.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
# IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title FamilySize IsAlone 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 3 2 0 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 1 1 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 3 2 0 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S 1 1 1
# Fill missing ages with the median age of the passenger's (Pclass, Sex) group.
group_median_age = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)

# Bucket ages into five right-closed intervals labelled 0-4.
age_edges = [0, 12, 18, 35, 60, 80]
age_labels = [0, 1, 2, 3, 4]
df['AgeBin'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title FamilySize IsAlone AgeBin 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 0 2 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 3 2 0 3 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 1 1 2 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 3 2 0 2 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S 1 1 1 2
# Split Fare into quartiles labelled 0 (lowest) to 3 (highest).
fare_quartile = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3])
df['FareBin'] = fare_quartile
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 0 2 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 3 2 0 3 3 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 1 1 2 1 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 3 2 0 2 3 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S 1 1 1 2 1
# Remove the rows whose Embarked value is missing.
df = df.dropna(axis=0, how='any', subset=['Embarked'])
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 0 2 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 3 2 0 3 3 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 1 1 2 1 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 3 2 0 2 3 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S 1 1 1 2 1
# HasCabin: 1 when a cabin is recorded, 0 otherwise.
cabin = df['Cabin']
df['HasCabin'] = cabin.notna().astype(int)

# Deck is the first character of the cabin string; 'U' stands in for a
# missing cabin.
df['Deck'] = cabin.fillna('U').str[0]

# Decks seen fewer than 10 times are merged into 'Other'.
deck_counts = df['Deck'].value_counts()
rare_decks = deck_counts.loc[deck_counts.lt(10)].index
df['Deck'] = df['Deck'].mask(df['Deck'].isin(rare_decks), 'Other')

# One-hot encode the deck and append the indicator columns.
deck_dummies = pd.get_dummies(df['Deck'], prefix='Deck')
df = pd.concat([df, deck_dummies], axis=1)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin HasCabin Deck Deck_A Deck_B Deck_C Deck_D Deck_E Deck_F Deck_Other Deck_U 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 0 2 0 0 U 0 0 0 0 0 0 0 1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C 3 2 0 3 3 1 C 0 0 1 0 0 0 0 0 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 1 1 2 1 0 U 0 0 0 0 0 0 0 1 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S 3 2 0 2 3 1 C 0 0 1 0 0 0 0 0 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S 1 1 1 2 1 0 U 0 0 0 0 0 0 0 1
# Ticket feature engineering: split each ticket string into a textual prefix
# and a trailing number, then derive indicator, bin and group-size columns.

# Treat a missing ticket as an empty string so the string methods below work.
df['Ticket'] = df['Ticket'].fillna('')
# Split on whitespace; every token except the last forms the prefix.
parts = df['Ticket'].str.split()
df['TicketPrefix'] = parts.str[:-1].str.join(' ')
# Keep the last token only when it is purely numeric; otherwise store None.
df['TicketNumber'] = parts.str[-1].where(parts.str[-1].str.isnumeric(), None)
# 1 when the ticket has a non-empty prefix, 0 otherwise.
df['HasTicketPrefix'] = (df['TicketPrefix'] != '').astype(int)
# Keep the 10 most frequent prefixes (the empty prefix counts as one of the
# candidates) and label everything else 'Other' before one-hot encoding.
prefix_counts = df['TicketPrefix'].value_counts()
top = prefix_counts.nlargest(10).index
df['TicketPrefix2'] = df['TicketPrefix'].where(df['TicketPrefix'].isin(top), 'Other')
prefix_dummies = pd.get_dummies(df['TicketPrefix2'], prefix='TktPre')
# Convert the number to float; tickets without a numeric part become 0.
df['TicketNumber'] = df['TicketNumber'].astype(float).fillna(0)
# Decile bin (integer bin index) of the ticket number.
df['TicketNum_qbin'] = pd.qcut(df['TicketNumber'], 10, labels=False)
# Number of rows that share the same full ticket string.
sizes = df.groupby('Ticket')['PassengerId'].transform('count')
df['TicketGroupSize'] = sizes
# 1 when more than one row shares the ticket, 0 otherwise.
df['IsGroupTicket'] = (sizes > 1).astype(int)
# Append the prefix indicators, then drop the raw and intermediate text columns.
df = pd.concat([df, prefix_dummies], axis=1)
df.drop(columns=['Ticket','TicketPrefix','TicketPrefix2'], inplace=True)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin HasCabin Deck Deck_A Deck_B Deck_C Deck_D Deck_E Deck_F Deck_Other Deck_U TicketNumber HasTicketPrefix TicketNum_qbin TicketGroupSize IsGroupTicket TktPre_ TktPre_A/5 TktPre_A/5. TktPre_C.A. TktPre_CA. TktPre_Other TktPre_PC TktPre_SOTON/O.Q. TktPre_SOTON/OQ TktPre_STON/O 2. TktPre_W./C. 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 7.2500 NaN S 1 2 0 2 0 0 U 0 0 0 0 0 0 0 1 21171.0 1 3 1 0 0 1 0 0 0 0 0 0 0 0 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 71.2833 C85 C 3 2 0 3 3 1 C 0 0 1 0 0 0 0 0 17599.0 1 3 1 0 0 0 0 0 0 0 1 0 0 0 0 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 7.9250 NaN S 2 1 1 2 1 0 U 0 0 0 0 0 0 0 1 3101282.0 1 9 1 0 0 0 0 0 0 1 0 0 0 0 0 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 53.1000 C123 S 3 2 0 2 3 1 C 0 0 1 0 0 0 0 0 113803.0 0 5 2 1 1 0 0 0 0 0 0 0 0 0 0 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 8.0500 NaN S 1 1 1 2 1 0 U 0 0 0 0 0 0 0 1 373450.0 0 9 1 0 1 0 0 0 0 0 0 0 0 0 0
# Coarse deck grouping keyed by the first letter of the cabin string
# ('U' stands in for a missing cabin). A letter that is absent from the
# mapping yields NaN and therefore all-zero indicator columns.
mapping = {
    'A': 'Upper',
    'B': 'Upper',
    'C': 'Middle',
    'D': 'Middle',
    'E': 'Middle',
    'F': 'Lower',
    'U': 'None',
}
deck_letter = df['Cabin'].fillna('U').str[0]
df['DeckGroup'] = deck_letter.map(mapping)

# One-hot encode the group and append the indicator columns.
group_dummies = pd.get_dummies(df['DeckGroup'], prefix='Deck')
df = pd.concat([df, group_dummies], axis=1)
print(df.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin HasCabin Deck Deck_A Deck_B Deck_C Deck_D Deck_E Deck_F Deck_Other Deck_U TicketNumber HasTicketPrefix TicketNum_qbin TicketGroupSize IsGroupTicket TktPre_ TktPre_A/5 TktPre_A/5. TktPre_C.A. TktPre_CA. TktPre_Other TktPre_PC TktPre_SOTON/O.Q. TktPre_SOTON/OQ TktPre_STON/O 2. TktPre_W./C. 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 7.2500 NaN S 1 2 0 2 0 0 U 0 0 0 0 0 0 0 1 21171.0 1 3 1 0 0 1 0 0 0 0 0 0 0 0 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 71.2833 C85 C 3 2 0 3 3 1 C 0 0 1 0 0 0 0 0 17599.0 1 3 1 0 0 0 0 0 0 0 1 0 0 0 0 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 7.9250 NaN S 2 1 1 2 1 0 U 0 0 0 0 0 0 0 1 3101282.0 1 9 1 0 0 0 0 0 0 1 0 0 0 0 0 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 53.1000 C123 S 3 2 0 2 3 1 C 0 0 1 0 0 0 0 0 113803.0 0 5 2 1 1 0 0 0 0 0 0 0 0 0 0 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 8.0500 NaN S 1 1 1 2 1 0 U 0 0 0 0 0 0 0 1 373450.0 0 9 1 0 1 0 0 0 0 0 0 0 0 0 0
# Show every column name currently in the frame.
print(list(df.columns))
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'AgeBin', 'FareBin', 'HasCabin', 'Deck', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_Other', 'Deck_U', 'TicketNumber', 'HasTicketPrefix', 'TicketNum_qbin', 'TicketGroupSize', 'IsGroupTicket', 'TktPre_', 'TktPre_A/5', 'TktPre_A/5.', 'TktPre_C.A.', 'TktPre_CA.', 'TktPre_Other', 'TktPre_PC', 'TktPre_SOTON/O.Q.', 'TktPre_SOTON/OQ', 'TktPre_STON/O 2.', 'TktPre_W./C.', 'DeckGroup', 'Deck_Lower', 'Deck_Middle', 'Deck_None', 'Deck_Upper']
# Columns excluded from the modelling frame: the row identifier, raw text
# columns, the continuous Age/Fare (their binned versions are kept), the
# per-deck indicators (the DeckGroup indicators are kept) and most of the
# individual ticket-prefix indicators.
to_drop = [
    'PassengerId', 'Name',
    'Cabin', 'Deck',
    'TicketNumber',
    'Age', 'Fare',
    'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_Other', 'Deck_U',
    'TktPre_', 'TktPre_A/5', 'TktPre_A/5.', 'TktPre_C.A.', 'TktPre_CA.',
    'TktPre_SOTON/O.Q.', 'TktPre_SOTON/OQ', 'TktPre_STON/O 2.', 'TktPre_W./C.',
]
df_model = df.drop(columns=to_drop)
# Preview the reduced frame. `df` itself is not modified by the drop above,
# so printing `df` here would not show the result of this step.
print(df_model.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Fare Cabin Embarked Title FamilySize IsAlone AgeBin FareBin HasCabin Deck Deck_A Deck_B Deck_C Deck_D Deck_E Deck_F Deck_Other Deck_U TicketNumber HasTicketPrefix TicketNum_qbin TicketGroupSize IsGroupTicket TktPre_ TktPre_A/5 TktPre_A/5. TktPre_C.A. TktPre_CA. TktPre_Other TktPre_PC TktPre_SOTON/O.Q. TktPre_SOTON/OQ TktPre_STON/O 2. TktPre_W./C. DeckGroup Deck_Lower Deck_Middle Deck_None Deck_Upper 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 7.2500 NaN S 1 2 0 2 0 0 U 0 0 0 0 0 0 0 1 21171.0 1 3 1 0 0 1 0 0 0 0 0 0 0 0 0 None 0 0 1 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 71.2833 C85 C 3 2 0 3 3 1 C 0 0 1 0 0 0 0 0 17599.0 1 3 1 0 0 0 0 0 0 0 1 0 0 0 0 Middle 0 1 0 0 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 7.9250 NaN S 2 1 1 2 1 0 U 0 0 0 0 0 0 0 1 3101282.0 1 9 1 0 0 0 0 0 0 1 0 0 0 0 0 None 0 0 1 0 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 53.1000 C123 S 3 2 0 2 3 1 C 0 0 1 0 0 0 0 0 113803.0 0 5 2 1 1 0 0 0 0 0 0 0 0 0 0 Middle 0 1 0 0 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 8.0500 NaN S 1 1 1 2 1 0 U 0 0 0 0 0 0 0 1 373450.0 0 9 1 0 1 0 0 0 0 0 0 0 0 0 0 None 0 0 1 0
from sklearn.model_selection import train_test_split
# Build the features from df_model, the frame from which the identifier,
# raw-text and superseded columns were already removed. Building them from
# `df` would train the classifier on PassengerId and on every column that was
# listed for removal.
X = df_model.drop(columns=['Survived'])
y = df_model['Survived']

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Remove the remaining non-numeric columns. errors='ignore' lets the list
# name columns (Name, Cabin, Deck) that df_model no longer contains.
drop_cols = ['Name', 'Cabin', 'Deck', 'DeckGroup', 'Embarked']
X_train = X_train.drop(columns=drop_cols, errors='ignore')
X_val = X_val.drop(columns=drop_cols, errors='ignore')

# Replace the categorical bin columns by their integer category codes.
for col in ['AgeBin', 'FareBin']:
    X_train[col] = X_train[col].cat.codes
    X_val[col] = X_val[col].cat.codes

# List the columns whose dtype is outside the allowed set. The dtypes are
# converted to their string names first: comparing dtype objects with plain
# strings via isin() is unreliable and reported float64 columns as disallowed.
allowed = ['int64', 'float64', 'bool']
dtype_names = X_train.dtypes.astype(str)
print(dtype_names[~dtype_names.isin(allowed)])
Age float64 Fare float64 IsAlone int32 AgeBin int8 FareBin int8 HasCabin int32 Deck_A uint8 Deck_B uint8 Deck_C uint8 Deck_D uint8 Deck_E uint8 Deck_F uint8 Deck_Other uint8 Deck_U uint8 TicketNumber float64 HasTicketPrefix int32 IsGroupTicket int32 TktPre_ uint8 TktPre_A/5 uint8 TktPre_A/5. uint8 TktPre_C.A. uint8 TktPre_CA. uint8 TktPre_Other uint8 TktPre_PC uint8 TktPre_SOTON/O.Q. uint8 TktPre_SOTON/OQ uint8 TktPre_STON/O 2. uint8 TktPre_W./C. uint8 Deck_Lower uint8 Deck_Middle uint8 Deck_None uint8 Deck_Upper uint8 dtype: object
from xgboost import XGBClassifier
# Gradient-boosted tree classifier with a fixed seed and log-loss as the
# evaluation metric.
# `use_label_encoder` is intentionally not passed: the option is deprecated in
# XGBoost and the 2.x estimator no longer has it, so supplying it only
# produces an "unused parameter" warning.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    enable_categorical=False,
)
xgb_model.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None, random_state=42, ...)
from sklearn.metrics import accuracy_score, classification_report
# Score the fitted model on the validation split.
y_pred = xgb_model.predict(X_val)
acc = accuracy_score(y_true=y_val, y_pred=y_pred)
report = classification_report(y_val, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"XGBoost Accuracy: {acc:.4f}")
print(report)
XGBoost Accuracy: 0.8146
precision recall f1-score support
0 0.83 0.88 0.85 110
1 0.79 0.71 0.74 68
accuracy 0.81 178
macro avg 0.81 0.79 0.80 178
weighted avg 0.81 0.81 0.81 178
浙大疏锦行
作者:qq_58459892