Python训练营第九日实战演练指南
知识点见示例代码
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the dataset from data.csv (path is resolved against the current
# working directory).
df = pd.read_csv('data.csv')
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()
# Preview the first rows. As a bare expression this only renders in an
# interactive/notebook session; in a plain script the result is discarded.
df.head()
# Integer codes for the two text columns that are encoded by hand.
# Series.map turns every value that is not a key below (including missing
# values) into NaN, so this step should run once per load: mapping an
# already-encoded column again would blank it out.
years_in_job_codes = {
    "< 1 year": 0,
    "1 year": 1,
    "2 years": 2,
    "3 years": 3,
    "4 years": 4,
    "5 years": 5,
    "6 years": 6,
    "7 years": 7,
    "8 years": 8,
    "9 years": 9,
    "10+ years": 10,
}
home_ownership_codes = {
    "Home Mortgage": 0,
    "Rent": 1,
    "Own Home": 2,
    "Have Mortgage": 3,
}

# Column name -> value-to-code table.
mappings = {
    "Years in current job": years_in_job_codes,
    "Home Ownership": home_ownership_codes,
}

# Overwrite each listed column with its numeric encoding.
for column_name, codes in mappings.items():
    df[column_name] = df[column_name].map(codes)

# Preview the encoded frame (only rendered in an interactive session).
df.head()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Columns that take part in the correlation analysis. 'Years in current job'
# is only numeric once the mapping step earlier in the file has been applied.
continuous_features = [
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Months since last delinquent',
    'Bankruptcies',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

# Pairwise correlation between the selected columns.
correlation_matrix = df[continuous_features].corr()

# Annotated heatmap: two-decimal cell labels, diverging palette, square cells.
heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'coolwarm',
    'square': True,
}
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Continuous Features')
plt.show()
# Hand-unrolled version: one boxplot per feature on a 2x2 grid. The same
# three calls are repeated for every subplot on purpose; the loop-based
# versions later in the file replace this repetition.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Name the four subplots so each section below reads by position.
(top_left, top_right), (bottom_left, bottom_right) = axes

# Row 0, column 0
i = 0
feature = features[i]
top_left.boxplot(df[feature].dropna())  # missing values are dropped before plotting
top_left.set_title(f'boxplot of {feature}')
top_left.set_ylabel(feature)

# Row 0, column 1
i = 1
feature = features[i]
top_right.boxplot(df[feature].dropna())
top_right.set_title(f'boxplot of {feature}')
top_right.set_ylabel(feature)

# Row 1, column 0
i = 2
feature = features[i]
bottom_left.boxplot(df[feature].dropna())
bottom_left.set_title(f'boxplot of {feature}')
bottom_left.set_ylabel(feature)

# Row 1, column 1
i = 3
feature = features[i]
bottom_right.boxplot(df[feature].dropna())
bottom_right.set_title(f'boxplot of {feature}')
bottom_right.set_ylabel(feature)

plt.tight_layout()
plt.show()
# Index-based loop version of the 2x2 boxplot grid.
# The range(len(...)) form is kept deliberately: it is the intermediate step
# between the hand-unrolled version and the enumerate version in this file.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for i in range(len(features)):
    feature = features[i]
    # i // 2 is the grid row and i % 2 the grid column for a 2-column layout.
    ax = axes[i // 2, i % 2]
    ax.boxplot(df[feature].dropna())  # missing values are dropped before plotting
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)

# Finish the figure the same way the hand-unrolled version does, so the plot
# is also laid out and displayed when this runs outside a notebook.
plt.tight_layout()
plt.show()
# Demonstrates enumerate: each iteration yields the position and the item
# together, so no manual indexing into the list is needed.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
for i, feature in enumerate(features):
    print(f'索引: {i}, 特征: {feature}')
# enumerate-based version of the 2x2 boxplot grid: the loop yields both the
# running index (used to pick the subplot) and the feature name.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for i, feature in enumerate(features):
    # i // 2 is the grid row and i % 2 the grid column for a 2-column layout.
    ax = axes[i // 2, i % 2]
    ax.boxplot(df[feature].dropna())  # missing values are dropped before plotting
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)

# Finish the figure the same way the hand-unrolled version does, so the plot
# is also laid out and displayed when this runs outside a notebook.
plt.tight_layout()
plt.show()
作业:运用上述知识处理心脏病数据集的特征,一次性用所有的处理方式完成预处理,并尝试手动完成。
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Load the heart-disease dataset from heart.csv (path is resolved against the
# current working directory). This rebinds df, replacing the frame loaded
# earlier in the file.
df = pd.read_csv('heart.csv')
# Check for missing values: count of nulls per column. As a bare expression
# the result is only rendered in an interactive/notebook session.
df.isnull().sum()
# One boxplot per heart-dataset column, laid out on a 4x4 grid.
features = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']
fig, axes = plt.subplots(4, 4, figsize=(20, 20))

for i, feature in enumerate(features):
    # i // 4 is the grid row and i % 4 the grid column for a 4-column layout.
    ax = axes[i // 4, i % 4]
    # Drop missing values before plotting, matching the other boxplot sections.
    ax.boxplot(df[feature].dropna())
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)

# The grid has 16 slots but only 14 features are plotted; hide the leftover
# axes so the figure does not end with empty frames.
for unused_ax in axes.ravel()[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()
# Correlation heatmap over the heart-dataset columns listed below.
features = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target',
]

# Pairwise correlation between the selected columns.
selected_columns = df[features]
correlation_matrix = selected_columns.corr()

# Annotated heatmap: two-decimal cell labels, diverging palette, square cells.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
)
plt.title('Correlation Matrix Of Heart Disease Dataset')
plt.show()
作者:冬天给予的预感