首页 > 技术文章 > 用户流失预测

jmchen 2020-09-05 15:49 原文

Churn Modeling

"""
# @Time    :  2020/9/5
# @Author  :  Jimou Chen
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def deal_train(path, *, scaler=None, fit=True):
    """Load a churn CSV and return standardized features with the target.

    The file is read with ``pd.read_csv``; ``Geography`` and ``Gender`` are
    converted to numeric codes, the ten model features are selected and
    standardized, and the ``Exited`` column is returned as the target.

    Args:
        path: Location of the CSV file. It must contain ``Exited`` and every
            column named in ``feature`` below.
        scaler: Scaler used for standardization. When ``None`` (the default)
            a new ``StandardScaler`` is created, so ``deal_train(path)``
            behaves as it always did.
        fit: When true (the default) the scaler is fitted on this file's
            features before transforming them. Pass ``False`` together with
            a scaler already fitted on the training file so that evaluation
            data is standardized with the training statistics.

    Returns:
        A ``(x_data, y_data)`` tuple: the standardized feature matrix
        produced by the scaler and the ``Exited`` column.

    Raises:
        ValueError: If ``fit`` is false but no fitted ``scaler`` was given.
    """
    if scaler is None:
        if not fit:
            raise ValueError('fit=False requires an already fitted scaler')
        scaler = StandardScaler()

    train_data = pd.read_csv(path)

    # Numeric codes for the categorical columns (country and gender).
    category_codes = {
        'Geography': {'France': 1, 'Spain': 2, 'Germany': 3},
        'Gender': {'Female': 0, 'Male': 1},
    }
    for column, codes in category_codes.items():
        raw = train_data[column]
        # ``map`` builds a new numeric column instead of writing integers
        # into a string column in place, so the result does not depend on
        # the column being stored with ``object`` dtype. Values outside the
        # mapping are kept untouched, exactly as before.
        train_data[column] = raw.map(codes).where(raw.isin(list(codes)), raw)

    # Keep only the columns that are useful as model features.
    feature = ['CreditScore', 'Geography', 'Gender',
               'Age', 'Tenure', 'Balance', 'NumOfProducts',
               'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    x_data = train_data[feature]
    y_data = train_data['Exited']

    # Standardize the features.
    if fit:
        x_data = scaler.fit_transform(x_data)
    else:
        x_data = scaler.transform(x_data)

    return x_data, y_data


if __name__ == '__main__':
    # One scaler is shared by both files: it is fitted on the training data
    # only and then reused for the test data, so the model sees test
    # features on the same scale it was trained on.
    feature_scaler = StandardScaler()
    x_train_data, y_train_data = deal_train('data/Churn-Modelling.csv',
                                            scaler=feature_scaler)
    x_test, y_test = deal_train('data/Churn-Modelling-Test-Data.csv',
                                scaler=feature_scaler, fit=False)

    # Build the model; other estimators are worth trying here as well.
    lr = LogisticRegression()
    lr.fit(x_train_data, y_train_data)

    # Predict and evaluate. classification_report takes (y_true, y_pred);
    # passing them the other way round swaps precision and recall.
    pred = lr.predict(x_test)
    print(classification_report(y_test, pred))
    print(lr.score(x_test, y_test))
D:\Anaconda\Anaconda3\python.exe D:/Appication/PyCharm/Git/kaggle-project/ChurnModelling/churn_predict.py
              precision    recall  f1-score   support

           0       0.96      0.76      0.85       934
           1       0.15      0.61      0.25        66

    accuracy                           0.75      1000
   macro avg       0.56      0.69      0.55      1000
weighted avg       0.91      0.75      0.81      1000

0.754

Process finished with exit code 0
  • 重点是处理好数据
  • 可以使用其他模型进行优化

推荐阅读