代码收藏家技术教程 2025-05-26

Python训练营第九日实战演练指南

知识点见示例代码

字典的简单介绍

标签编码

连续特征的处理：归一化和标准化

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the raw dataset and take a first look at it.
df = pd.read_csv('data.csv')
df.info()
df.head()  # bare expression: its value is displayed only in an interactive session

# Label encoding tables: one dictionary per categorical column, mapping each
# text category to the integer that replaces it.
years_in_job_codes = {
    "10+ years": 10,
    "2 years": 2,
    "3 years": 3,
    "< 1 year": 0,
    "5 years": 5,
    "1 year": 1,
    "4 years": 4,
    "6 years": 6,
    "7 years": 7,
    "8 years": 8,
    "9 years": 9,
}
home_ownership_codes = {
    "Home Mortgage": 0,
    "Rent": 1,
    "Own Home": 2,
    "Have Mortgage": 3,
}

# Outer dictionary: column name -> encoding table for that column.
mappings = {
    "Years in current job": years_in_job_codes,
    "Home Ownership": home_ownership_codes,
}

# Replace each listed column with its encoded values. Series.map with a dict
# yields NaN for any value that has no entry in the dict.
for column, codes in mappings.items():
    df[column] = df[column].map(codes)
df.head()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns included in the correlation analysis.
continuous_features = [
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Months since last delinquent',
    'Bankruptcies',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

# Pairwise correlation between the selected columns.
correlation_matrix = df[continuous_features].corr()

# Draw the matrix as an annotated heatmap on its own 12x10 figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,   # write the coefficient inside every cell
    fmt=".2f",    # two decimal places
    cmap='coolwarm',
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Continuous Features')
plt.show()
# Box plots of four features on a 2x2 grid, written out one panel at a time
# (no loop): each stanza picks a feature by index and a panel by (row, column).
features = [
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
]
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Panel 1 of 4: top-left.
i = 0
feature = features[i]
panel = axes[0, 0]
panel.boxplot(df[feature].dropna())  # drop missing values before plotting
panel.set_title(f'boxplot of {feature}')
panel.set_ylabel(feature)

# Panel 2 of 4: top-right.
i = 1
feature = features[i]
panel = axes[0, 1]
panel.boxplot(df[feature].dropna())
panel.set_title(f'boxplot of {feature}')
panel.set_ylabel(feature)

# Panel 3 of 4: bottom-left.
i = 2
feature = features[i]
panel = axes[1, 0]
panel.boxplot(df[feature].dropna())
panel.set_title(f'boxplot of {feature}')
panel.set_ylabel(feature)

# Panel 4 of 4: bottom-right.
i = 3
feature = features[i]
panel = axes[1, 1]
panel.boxplot(df[feature].dropna())
panel.set_title(f'boxplot of {feature}')
panel.set_ylabel(feature)

plt.tight_layout()
plt.show()
# Box plots of four features on a 2x2 grid, drawn with an index-based loop.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
# Index-based iteration is kept on purpose here; the enumerate form of this
# same loop appears further down in the file.
for i in range(len(features)):
    feature = features[i]
    # Flat index -> grid position: row = i // 2, column = i % 2.
    ax = axes[i // 2, i % 2]
    ax.boxplot(df[feature].dropna())  # drop missing values before plotting
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)
# Finish the figure the same way as the unrolled version of this plot:
# without these calls a plain script never lays out or shows this figure here.
plt.tight_layout()
plt.show()
# Demonstration of enumerate: it yields (index, item) pairs, so the loop gets
# both the position and the feature name without indexing the list by hand.
features = [
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
]
for i, feature in enumerate(features, start=0):
    print(f'索引: {i}, 特征: {feature}')

# Box plots of four features on a 2x2 grid, iterating with enumerate.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for i, feature in enumerate(features):
    # divmod(i, 2) gives (i // 2, i % 2): the panel's row and column.
    row, col = divmod(i, 2)
    ax = axes[row, col]
    ax.boxplot(df[feature].dropna())  # drop missing values before plotting
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)
# Finish the figure the same way as the unrolled version of this plot:
# without these calls a plain script never lays out or shows this figure here.
plt.tight_layout()
plt.show()

作业：运用上述知识处理心脏病数据集的特征，一次性用所有的处理方式完成预处理，并尝试手动完成。

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Homework: exploratory plots for the heart-disease dataset.
df = pd.read_csv('heart.csv')

# Check for missing values: per-column count of nulls. As a bare expression,
# its value is displayed only in an interactive session.
df.isnull().sum()

# Columns to plot; defined once and reused for both figures below.
features = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# One box plot per column on a 4x4 grid.
n_rows, n_cols = 4, 4
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20))
for i, feature in enumerate(features):
    # Flat index -> grid position: row = i // n_cols, column = i % n_cols.
    ax = axes[i // n_cols, i % n_cols]
    # dropna() matches the earlier box plots in this file; it is a no-op when
    # the column has no missing values.
    ax.boxplot(df[feature].dropna())
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)
# The grid has 16 panels but only 14 features: hide the panels left empty.
for unused_ax in axes.ravel()[len(features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

# Pairwise correlation between the same columns, drawn as an annotated heatmap.
correlation_matrix = df[features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix Of Heart Disease Dataset')
plt.show()

作者：冬天给予的预感

物联沃分享整理
物联沃-IOTWORD物联网 » Python训练营第九日实战演练指南