Python训练营第九日实战演练指南

知识点见示例代码

  • 字典的简单介绍
  • 标签编码
  • 连续特征的处理:归一化和标准化
  • 示例代码:

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pandas as pd
    # Load the raw dataset, then inspect its schema and its leading rows.
    df = pd.read_csv('data.csv')
    df.info()
    df.head()

    # Label-encoding tables: one inner dict per categorical column, mapping
    # each category string to the integer code that replaces it.
    mappings = {
        "Years in current job": {
            "< 1 year": 0,
            "1 year": 1,
            "2 years": 2,
            "3 years": 3,
            "4 years": 4,
            "5 years": 5,
            "6 years": 6,
            "7 years": 7,
            "8 years": 8,
            "9 years": 9,
            "10+ years": 10,
        },
        "Home Ownership": {
            "Home Mortgage": 0,
            "Rent": 1,
            "Own Home": 2,
            "Have Mortgage": 3,
        },
    }

    # Replace every listed column in place with its encoded version.
    for column, codes in mappings.items():
        df[column] = df[column].map(codes)
    df.head()
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # Columns treated as continuous; their pairwise correlations are plotted.
    continuous_features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
        'Years of Credit History',
        'Maximum Open Credit',
        'Number of Credit Problems',
        'Months since last delinquent',
        'Bankruptcies',
        'Current Loan Amount',
        'Current Credit Balance',
        'Monthly Debt',
        'Credit Score',
    ]

    # Correlation between every pair of the selected columns
    # (DataFrame.corr called with its default settings).
    correlation_matrix = df[continuous_features].corr()

    # Render the matrix as an annotated heatmap on an explicit 12x10 inch Axes.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        ax=ax,
    )
    ax.set_title('Correlation Matrix of Continuous Features')
    plt.show()
    # Box plots of four features on a 2x2 grid.
    # Each panel is written out explicitly, without a loop.
    features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
    ]
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # Panel (row 0, column 0)
    i = 0
    feature = features[i]
    panel = axes[0, 0]
    panel.boxplot(df[feature].dropna())  # missing values are dropped first
    panel.set(title=f'boxplot of {feature}', ylabel=feature)

    # Panel (row 0, column 1)
    i = 1
    feature = features[i]
    panel = axes[0, 1]
    panel.boxplot(df[feature].dropna())
    panel.set(title=f'boxplot of {feature}', ylabel=feature)

    # Panel (row 1, column 0)
    i = 2
    feature = features[i]
    panel = axes[1, 0]
    panel.boxplot(df[feature].dropna())
    panel.set(title=f'boxplot of {feature}', ylabel=feature)

    # Panel (row 1, column 1)
    i = 3
    feature = features[i]
    panel = axes[1, 1]
    panel.boxplot(df[feature].dropna())
    panel.set(title=f'boxplot of {feature}', ylabel=feature)

    plt.tight_layout()
    plt.show()
    # The same 2x2 grid of box plots, now driven by an index loop.
    features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
    ]
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for i in range(len(features)):
        feature = features[i]
        # divmod(i, 2) converts the flat index into (row, column) on the
        # two-column grid: 0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 0), 3 -> (1, 1).
        row, col = divmod(i, 2)
        panel = axes[row, col]
        panel.boxplot(df[feature].dropna())
        panel.set_title(f'boxplot of {feature}')
        panel.set_ylabel(feature)
    # Demonstrates enumerate(): every iteration yields an (index, item) pair,
    # so no manual counter or features[i] lookup is required.
    features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
    ]
    for pair in enumerate(features):
        i, feature = pair
        print(f'索引: {i}, 特征: {feature}')
    
    # Final form of the 2x2 box-plot grid: enumerate() supplies both the
    # position and the feature name.
    features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
    ]
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for i, feature in enumerate(features):
        # axes.flat walks the grid row by row, so flat index i addresses the
        # same panel as axes[i // 2, i % 2].
        ax = axes.flat[i]
        values = df[feature].dropna()
        ax.boxplot(values)
        ax.set_title(f'boxplot of {feature}')
        ax.set_ylabel(feature)
  • 作业:用上述知识处理心脏病数据集的特征,一次性应用所有处理方式完成预处理,并尝试手动实现。

    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    
    # Homework: explore the heart-disease dataset with box plots and a
    # correlation heatmap.
    df = pd.read_csv('heart.csv')

    # Check for missing values: per-column count of null entries.
    # NOTE(review): the result is not printed; it is only shown if a notebook
    # displays it as the last expression of a cell.
    df.isnull().sum()

    # Columns to visualise; the same list drives both figures below.
    features = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']

    # One box plot per column on a 4x4 grid.
    fig, axes = plt.subplots(4, 4, figsize=(20, 20))
    for i, feature in enumerate(features):
        # enumerate already yields the feature name; no features[i] lookup.
        ax = axes[i // 4, i % 4]
        ax.boxplot(df[feature])
        ax.set_title(f'boxplot of {feature}')
        ax.set_ylabel(feature)

    # The grid has 16 panels but only len(features) are used; hide the rest
    # so the figure does not show empty framed axes.
    for unused_ax in axes.flat[len(features):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

    # Pairwise correlation of all listed columns, drawn as an annotated heatmap.
    correlation_matrix = df[features].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
    plt.title('Correlation Matrix Of Heart Disease Dataset')
    plt.show()

    作者:冬天给予的预感

    物联沃分享整理
    物联沃-IOTWORD物联网 » Python训练营第九日实战演练指南

    发表回复