1.10. 银行欺诈检测比赛#

PDF请访问银行欺诈检测.pdf

1.10.1. Initialization#

# 导入包 用于读取文件
import pandas as pd

读数据,合并

# Load the training and test sets. Both names are used below (`train` again in
# the data-inspection step), so the reads must actually run.
train = pd.read_csv('data/bankfraud_train.csv')
test = pd.read_csv('data/bankfraud_test.csv')
# Stack train and test row-wise into a single frame.
data = pd.concat([train, test], axis=0)
# Hook for merging additional feature files (kept disabled):
# data = pd.merge(data, car_price,on='auto_model', how='outer')
# Declare some variables.
# Large US cities, intended for feature engineering.
top_cities = [
    "NewYork", "LosAngeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "SanAntonio", "SanDiego", "Dallas", "SanJose",
    "Austin", "Jacksonville", "FortWorth", "Columbus", "Indianapolis",
    "Charlotte", "SanFrancisco", "Seattle", "Denver", "Washington",
    "Nashville", "OklahomaCity", "ElPaso", "Boston", "Portland",
    "LasVegas", "Detroit", "Memphis", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Sacramento",
    "KansasCity", "Mesa", "Atlanta", "Omaha", "ColoradoSprings",
    "Raleigh", "LongBeach", "VirginiaBeach", "Miami", "Oakland",
    "Minneapolis", "Tulsa", "Bakersfield", "Wichita", "Arlington",
]

1.10.2. view the data#

# Count how many training rows fall into each incident city.
train['incident_city'].value_counts()
Springfield    117
Arlington      110
Columbus       108
Northbend       96
Hillsdale       96
Riverwood       90
Northbrook      83
Name: incident_city, dtype: int64
# Number of distinct (non-missing) car models in the combined data.
data['auto_model'].nunique()
39
# Show how many distinct values each non-numeric (object) column holds.
object_columns = data.select_dtypes(include=object).columns
for column in object_columns:
    # nunique is a method, so it has to be called with parentheses.
    print(column, data[column].nunique())
policy_bind_date 951
policy_state 3
policy_csl 3
insured_sex 2
insured_education_level 7
insured_occupation 14
insured_hobbies 20
insured_relationship 6
incident_date 60
incident_type 4
collision_type 4
incident_severity 4
authorities_contacted 5
incident_state 7
incident_city 7
incident_location 1000
property_damage 3
police_report_available 3
auto_make 14
auto_model 39

1.10.3. Feature engineering#

1.10.3.1. 把日期转换成更有意义的特征,例如星期几#

# Parse the two date columns; values that cannot be parsed become NaT.
bind_dates = pd.to_datetime(data['policy_bind_date'], errors='coerce')
incident_dates = pd.to_datetime(data['incident_date'], errors='coerce')

# Day of the week for each date (pandas convention: Monday=0 ... Sunday=6).
data['policy_bind_date_weekday'] = bind_dates.dt.weekday
data['incident_date_weekday'] = incident_dates.dt.weekday

# Express both dates as whole days elapsed since the earliest policy bind date.
base_date = bind_dates.min()
data['p_diff'] = (bind_dates - base_date).dt.days
data['i_diff'] = (incident_dates - base_date).dt.days

# The raw date columns are dropped once the numeric features exist.
data.drop(columns=['policy_bind_date', 'incident_date'], inplace=True)

# Difference between the two day offsets (policy bind minus incident).
data['pi_diff'] = data['p_diff'] - data['i_diff']

1.10.3.2. 手动对某个特征做one-hot处理#

# t = pd.read_csv('data/bankfraud_train.csv')
# temp = pd.DataFrame(columns=["months_as_customer"])
# temp = t["months_as_customer"] 
# t = temp
# func1 = lambda x: 1 if x == True else 0
# t["months_as_customer"] = t["months_as_customer"]  > 24
# t["months_as_customer"]  = t["months_as_customer"] .apply(func1)
# t

1.10.3.3. 对某类别数据进行bin处理#

# Tier table for incident states. A DataFrame built from lists needs columns of
# equal length; "chi" and "ttt" look like placeholder padding for that reason.
v = pd.DataFrame({
    'top_state': ["NY", "SC", "WV"],
    'second_state': ["NC", "VA", "chi"],
    'third_state': ["PA", "OH", "ttt"],
})

# Build a state -> tier lookup. Tiers are filled from lowest to highest so a
# value listed in several columns keeps the highest tier, which is what a
# top-first membership test would return.
state_tier = {}
for tier_label, tier_column in (('1', 'third_state'),
                                ('2', 'second_state'),
                                ('3', 'top_state')):
    for state in v[tier_column]:
        state_tier[state] = tier_label

# States that appear in no tier are labelled '0'.
data['big_state'] = data['incident_state'].apply(
    lambda state: state_tier.get(state, '0')
)
# The raw column is replaced by its binned version.
data.drop(columns='incident_state', inplace=True)
# Tier table for incident cities. A DataFrame built from lists needs columns of
# equal length; "chi" and "ttt" look like placeholder padding for that reason.
v = pd.DataFrame({
    'top_city': ["Springfield", "Arlington", "Columbus"],
    'second_city': ["Northbend", "Hillsdale", "chi"],
    'third_city': ["Riverwood", "Northbrook", "ttt"],
})

# Build a city -> tier lookup. Tiers are filled from lowest to highest so a
# value listed in several columns keeps the highest tier, which is what a
# top-first membership test would return.
city_tier = {}
for tier_label, tier_column in (('1', 'third_city'),
                                ('2', 'second_city'),
                                ('3', 'top_city')):
    for city in v[tier_column]:
        city_tier[city] = tier_label

# Cities that appear in no tier are labelled '0'.
data['big_city'] = data['incident_city'].apply(
    lambda city: city_tier.get(city, '0')
)

# Once binned, the raw column can be removed.
data.drop(columns='incident_city', inplace=True)

1.10.3.4. 建立特殊值表, 做label encode#

# Collect, for every object-dtype column, its name and its number of distinct
# values.
object_cols = data.select_dtypes(include=object).columns
column_name = list(object_cols)
unique_value = [data[name].nunique() for name in object_cols]

# Tabulate the counts, highest cardinality first.
df = pd.DataFrame({'col_name': column_name, 'value': unique_value})
df = df.sort_values('value', ascending=False)
df
col_name value
11 incident_location 1000
15 auto_model 39
5 insured_hobbies 20
4 insured_occupation 14
14 auto_make 14
3 insured_education_level 7
6 insured_relationship 6
10 authorities_contacted 5
9 incident_severity 4
7 incident_type 4
8 collision_type 4
13 police_report_available 3
16 big_state 3
0 policy_state 3
12 property_damage 3
1 policy_csl 3
17 big_city 3
2 insured_sex 2
# Empty frame that will receive the label-encoded categorical columns.
temp = pd.DataFrame()
# Names of all object-dtype columns.
cat_columns = data.select_dtypes(include='O').columns
cat_l = list(cat_columns)
# Copy of the data without the object-dtype columns, removed in a single call.
float_d = data.drop(columns=cat_l)
from sklearn.preprocessing import LabelEncoder

# Replace each object-dtype column by integer codes, using a fresh encoder per
# column.
for col in cat_columns:
    temp[col] = LabelEncoder().fit_transform(data[col])

# 1-based row number; the later merge with the numeric frame joins on it.
row_numbers = range(1, len(temp) + 1)
temp['index'] = row_numbers
# Display only: set_index returns a new frame, so temp keeps its 'index' column.
temp.set_index('index')

1.10.3.5. 爬取车型列unique数据的车价,用车价给各位欠款人做个特征#

# 简单爬虫代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
import time
import datetime
import logging
import random
import openpyxl
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Configure logging and create the workbook that collects the scraped rows.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['car', 'price'])

# Path of the ChromeDriver binary. It has to be defined at module level before
# the driver is created below; foreignWeb() uses the same path for its own
# driver.
chrome_driver = r'./win/chromedriver'

# Hide Chrome's "controlled by automated test software" banner.
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm the
# installed version still accepts it.
browser = webdriver.Chrome(executable_path=chrome_driver, options=options)

# Load the car names to look up from the 'car' column of the spreadsheet.
car = pd.read_excel('./车.xlsx')
car = car["car"].tolist()

# 爬国外二手车网站
def foreignWeb(car):
    """Open the CarMax search page for ``car`` in a throwaway Chrome session.

    A new incognito Chrome instance (images disabled) is started for every
    call, pointed at the CarMax search results for ``car``, and the
    "close filters." element is looked up and printed. The session is always
    shut down before returning, so repeated calls do not pile up browser
    processes.

    Args:
        car: Search term (car make/model); converted to ``str`` and
            URL-encoded before being placed in the query string.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the
            "close filters." element is not present on the page.
    """
    from urllib.parse import quote

    chrome_driver = r'./win/chromedriver'
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    # Do not load images, to speed up page loads.
    options.add_argument('blink-settings=imagesEnabled=false')
    # NOTE(review): Chrome's switch is normally spelled `user-agent=`; confirm
    # this capitalised form is actually honoured.
    options.add_argument('User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36')

    # NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm the
    # installed version still accepts it.
    browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
    try:
        # Search for the requested car rather than a fixed make.
        browser.get('https://www.carmax.com/cars?search=' + quote(str(car)))
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath helper,
        # which newer Selenium releases no longer provide.
        close_filters = browser.find_element(
            By.XPATH, '//div[contains(text(), "close filters.")]'
        )
        print(close_filters)
        time.sleep(20)
    finally:
        # Release the per-call browser even when the lookup above fails.
        browser.quit()

# Visit the foreign used-car site once per car name, counting the attempts.
num = 0
for i in car:
    foreignWeb(i)
    num += 1

# Save the workbook and log how many car names were processed.
wb.save(filename='car_info2.xlsx')
logging.info(f'共获取{num}条信息')
# NOTE: the shared module-level `browser` is deliberately not quit here. The
# Yiche crawl further down still drives it and quits it when it has finished.
# 易车网的爬虫代码
def Chineseprocess(car):
    """Search Yiche for ``car`` and append every result row to the worksheet.

    Uses the module-level ``browser`` (expected to be showing the Yiche search
    page already) and the module-level ``sheet``. For each ``<li>`` in the
    result list one ``[car, text]`` row is appended; if reading the result
    list fails, a single ``[car, ""]`` row is recorded instead.

    Args:
        car: Search term typed into the Yiche search box.
    """
    # Wait up to 20 s for the search box to be present in the page.
    wait = WebDriverWait(browser, 20)
    _input = wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'yccmp-search-input')))

    # Clear any previous query before typing the new one, then submit it.
    _input.clear()
    _input.send_keys(car)
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # newer Selenium releases no longer provide.
    browser.find_element(By.XPATH, "//input[@class='yccmp-search-btn']").click()

    try:
        elem = browser.find_element(By.XPATH, '//*[@class="pp-car-list"]/ul')
        for li in elem.find_elements(By.TAG_NAME, "li"):
            sheet.append([car, li.text])
    except Exception:
        # Best effort: keep a row for this car even when no result list could
        # be read, so the car is not silently missing from the output.
        sheet.append([car, ""])

    # Pause between queries.
    time.sleep(5)
# Crawl Yiche: open the model-search page once, then query it for every car.
num = 0
try:
    browser.get('https://so.yiche.com/chexing/')
    for i in car:
        Chineseprocess(i)
        num += 1
finally:
    # Always persist what was collected, log the count and release the
    # browser, even if one of the lookups raised part-way through.
    try:
        wb.save(filename='car_info.xlsx')
        logging.info(f'共获取{num}条信息')
    finally:
        browser.quit()

1.10.4. 标准化处理的scratch#

def normalize(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    The columns ``model_price`` and ``policy_number`` are copied through
    unchanged. A column whose values are all equal has no range to divide by;
    it is mapped to 0.0 rather than dividing by zero (which would turn the
    whole column into NaN). Missing values stay missing.

    Args:
        df: DataFrame whose columns (other than the two skipped ones) support
            ``min``, ``max`` and arithmetic.

    Returns:
        A new DataFrame with the same columns and index as ``df``; ``df``
        itself is not modified.
    """
    skipped = ('model_price', 'policy_number')
    result = df.copy()
    for feature_name in df.columns:
        if feature_name in skipped:
            continue
        column = df[feature_name]
        min_value = column.min()
        span = column.max() - min_value
        shifted = column - min_value
        if span == 0:
            # Constant column: every present value becomes 0.0, NaN stays NaN.
            result[feature_name] = shifted.astype(float)
        else:
            result[feature_name] = shifted / span
    return result
# Min-max scale the numeric frame, then add a 1-based row number; the later
# merge with the label-encoded frame joins on that column.
float_d = normalize(float_d)
float_d = float_d.assign(index=range(1, len(float_d) + 1))
# Display only: set_index returns a new frame, so float_d keeps its 'index' column.
float_d.set_index('index')
months_as_customer age policy_number policy_deductable policy_annual_premium umbrella_limit insured_zip capital-gains capital-loss incident_hour_of_the_day ... property_claim vehicle_claim auto_year fraud_reported _c39 policy_bind_date_weekday incident_date_weekday p_diff i_diff pi_diff
index
1 0.390397 0.400000 125591 0.333333 0.606303 0.545455 0.109207 0.598010 1.000000 0.913043 ... 0.243768 0.579821 0.25 0.0 NaN 0.500000 0.666667 0.938644 0.254237 0.940601
2 0.507307 0.555556 967713 0.000000 0.232788 0.090909 0.891259 0.330348 1.000000 0.173913 ... 0.434305 0.451755 0.05 0.0 NaN 0.500000 0.166667 0.316914 0.440678 0.318755
3 0.050104 0.311111 649082 0.333333 0.922720 0.090909 0.006146 0.000000 0.593159 0.000000 ... 0.395437 0.411247 0.35 0.0 NaN 0.666667 0.833333 0.239974 0.389831 0.242276
4 0.448852 0.511111 519312 0.000000 0.876860 0.090909 0.028215 0.000000 0.558956 0.869565 ... 0.241234 0.645616 0.40 1.0 NaN 0.166667 0.666667 0.748474 0.610169 0.748477
5 0.177453 0.244444 190588 0.333333 0.224883 0.090909 0.964392 0.720398 0.306931 0.391304 ... 0.452894 0.538558 0.45 0.0 NaN 1.000000 0.666667 0.474390 0.847458 0.473346
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
996 0.064718 0.377778 679370 1.000000 0.548183 0.909091 0.899328 0.000000 0.292529 0.391304 ... 0.019856 0.040508 0.35 NaN NaN 1.000000 0.666667 0.382084 0.491525 0.383486
997 0.620042 0.622222 272330 0.000000 0.733042 0.727273 0.137584 0.000000 0.464446 0.000000 ... 0.234474 0.418040 0.20 NaN NaN 1.000000 0.666667 0.791739 0.254237 0.793951
998 0.524008 0.444444 315631 1.000000 0.494747 0.090909 0.957801 0.000000 0.714671 0.434783 ... 0.233629 0.416530 0.10 NaN NaN 0.666667 0.500000 0.368134 0.118644 0.371954
999 0.112735 0.355556 445195 0.000000 0.512898 0.090909 0.121352 0.681592 0.621062 0.913043 ... 0.303760 0.722732 0.85 NaN NaN 0.000000 0.333333 0.824651 0.932203 0.822454
1000 0.323591 0.333333 914815 0.000000 0.788882 0.090909 0.169629 0.000000 1.000000 0.043478 ... 0.361639 0.752925 0.15 NaN NaN 0.500000 0.333333 0.028553 0.101695 0.033072

1000 rows × 25 columns

# Join the label-encoded columns with the scaled numeric columns on the shared
# 1-based 'index' column.
data = temp.merge(float_d, on='index')

1.10.5. 训练模型#

# from sklearn.model_selection import RandomizedSearchCV
# import lightgbm as lgb

# rs_params = {

#         'colsample_bytree': (0.5, 0.6, 1),
#         'learning_rate': (0.005, 0.1, 0.2, 0.3),
#         'reg_lambda': (0.25, 0.3, 0.5, 3, 5),
#         'max_depth': (-1, 2, 3, 5, 10),
#         'min_child_samples': (1, 3, 5, 9, 10),
#         'num_leaves': (20, 2**5-1, 2**5-1, 300, 400),
#         'reg_alpha' : (0.1, 0.25, 0.3, 0.5, 3, 5)
    
# }

# # Initialize a RandomizedSearchCV object using 7-fold CV (cv = 7 below)
# # 折15次,每次用100样本
# rs_cv = RandomizedSearchCV(estimator=lgb.LGBMClassifier(), param_distributions=rs_params, cv = 7, n_iter=300,verbose=1)

# # Train on training data
# rs_cv.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'],verbose=1)
# print(rs_cv.best_params_)
# print(rs_cv.best_score_)
# model_lgb = lgb.LGBMClassifier(num_leaves = 300, 
#                                reg_alpha=5, 
#                                reg_lambda=0.5, 
#                                objective='binary', 
#                                max_depth=3, 
#                                learning_rate=0.3, 
#                                min_child_samples=5, 
#                                random_state=7777,n_estimators=2000,subsample=1, colsample_bytree=1,)
# model_lgb.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'])
LGBMClassifier(colsample_bytree=1, learning_rate=0.3, max_depth=3,
               min_child_samples=5, n_estimators=2000, num_leaves=300,
               objective='binary', random_state=7777, reg_alpha=5,
               reg_lambda=0.5, subsample=1)
# y_pred = model_lgb.predict_proba(test.drop(['fraud_reported'], axis=1))
# train['fraud_reported'].mean()
0.25857142857142856
# sum(y_pred) / 300
array([0.77069713, 0.22930287])
# result = pd.read_csv('sampleSubmission.csv')
# # 调整矩阵形状
# result['fraud_reported'] = y_pred[:, 1]
# result
policy_number fraud_reported
0 698589 0.587929
1 287489 0.507880
2 211578 0.040526
3 807369 0.045366
4 830878 0.053479
... ... ...
295 679370 0.059729
296 272330 0.101335
297 315631 0.069951
298 445195 0.036823
299 914815 0.080392

300 rows × 2 columns