1.10. Bank Fraud Detection Competition#
For a PDF version, see 银行欺诈检测.pdf.
1.10.1. Initialization#
# Import pandas for reading and manipulating the data files
import pandas as pd
Read the data and merge the train and test sets:
train = pd.read_csv('data/bankfraud_train.csv')
test = pd.read_csv('data/bankfraud_test.csv')
# Merge the train and test sets so features are engineered on both at once
data = pd.concat([train, test], axis=0)
# Used later to merge in extra feature files (e.g. the scraped car prices)
# data = pd.merge(data, car_price, on='auto_model', how='outer')
# Declare some variables
# Large US cities, used for feature engineering
top_cities = ["NewYork","LosAngeles","Chicago","Houston","Phoenix","Philadelphia","SanAntonio","SanDiego","Dallas","SanJose","Austin","Jacksonville","FortWorth","Columbus","Indianapolis","Charlotte","SanFrancisco","Seattle","Denver","Washington","Nashville","OklahomaCity","ElPaso","Boston","Portland","LasVegas","Detroit","Memphis","Louisville","Baltimore","Milwaukee","Albuquerque","Tucson","Fresno","Sacramento","KansasCity","Mesa","Atlanta","Omaha","ColoradoSprings","Raleigh","LongBeach","VirginiaBeach","Miami","Oakland","Minneapolis","Tulsa","Bakersfield","Wichita","Arlington"]
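top_cities is declared for feature engineering but is not used in the code shown below. A minimal sketch of one way it could be used; the is_big_city column is an assumption, and note that most incident_city values in this dataset (e.g. Springfield, Northbend) do not appear in the list:

```python
# Hypothetical feature from top_cities: did the incident occur in a large US city?
data['is_big_city'] = data['incident_city'].isin(top_cities).astype(int)
```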
1.10.2. View the data#
train['incident_city'].value_counts()
Springfield 117
Arlington 110
Columbus 108
Northbend 96
Hillsdale 96
Riverwood 90
Northbrook 83
Name: incident_city, dtype: int64
# Number of distinct auto models (equivalent to data['auto_model'].nunique())
data['auto_model'].value_counts().count()
39
# Count the unique values in each non-numeric (object) column
for col in data.select_dtypes(include=object).columns:
    # nunique is a method, remember the parentheses
    print(col, data[col].nunique())
policy_bind_date 951
policy_state 3
policy_csl 3
insured_sex 2
insured_education_level 7
insured_occupation 14
insured_hobbies 20
insured_relationship 6
incident_date 60
incident_type 4
collision_type 4
incident_severity 4
authorities_contacted 5
incident_state 7
incident_city 7
incident_location 1000
property_damage 3
police_report_available 3
auto_make 14
auto_model 39
1.10.3. Feature engineering#
1.10.3.1. Convert dates into more meaningful features, such as the day of the week#
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], errors='coerce')
data['incident_date'] = pd.to_datetime(data['incident_date'], errors='coerce')
# Day of the week (0 = Monday) for both dates
data['policy_bind_date_weekday'] = data['policy_bind_date'].dt.weekday
data['incident_date_weekday'] = data['incident_date'].dt.weekday
# Days elapsed since the earliest policy bind date
base_date = data['policy_bind_date'].min()
data['p_diff'] = (data['policy_bind_date'] - base_date).dt.days
data['i_diff'] = (data['incident_date'] - base_date).dt.days
data.drop(['policy_bind_date', 'incident_date'], axis=1, inplace=True)
# Days between the two dates (policy bind date minus incident date)
data['pi_diff'] = data['p_diff'] - data['i_diff']
1.10.3.2. Manually one-hot encode a single feature#
# t = pd.read_csv('data/bankfraud_train.csv')
# # Binarize months_as_customer: 1 if the customer has been around longer than 24 months
# t['months_as_customer'] = (t['months_as_customer'] > 24).astype(int)
# t
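Strictly speaking, the commented snippet binarizes a numeric column rather than one-hot encoding a categorical one. For a true one-hot encoding, pandas' get_dummies is the direct route; a minimal sketch on insured_sex (the original pipeline label-encodes these columns instead):

```python
# One-hot encode a categorical column; prefix keeps the new column names readable.
dummies = pd.get_dummies(data['insured_sex'], prefix='insured_sex')
data = pd.concat([data.drop('insured_sex', axis=1), dummies], axis=1)
```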
1.10.3.3. Bin certain categorical features#
v = pd.DataFrame({
    'top_state': ["NY", "SC", "WV"],
    'second_state': ["NC", "VA", "chi"],
    'third_state': ["PA", "OH", "ttt"]
})
data['big_state'] = data['incident_state'].apply(
    lambda x: '3' if x in v['top_state'].values
    else '2' if x in v['second_state'].values
    else '1' if x in v['third_state'].values
    else '0')
data.drop('incident_state', axis=1, inplace=True)
v = pd.DataFrame({
    'top_city': ["Springfield", "Arlington", "Columbus"],
    'second_city': ["Northbend", "Hillsdale", "chi"],
    'third_city': ["Riverwood", "Northbrook", "ttt"]
})
data['big_city'] = data['incident_city'].apply(
    lambda x: '3' if x in v['top_city'].values
    else '2' if x in v['second_city'].values
    else '1' if x in v['third_city'].values
    else '0')
# The original column can be dropped once the binned version exists
data.drop('incident_city', axis=1, inplace=True)
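The chained conditional expression works but is hard to read. An equivalent sketch with a flat dict and Series.map, shown as it would replace the big_city lambda (i.e. run before incident_city is dropped):

```python
# Equivalent binning with a plain mapping; unlisted cities fall back to '0'.
city_rank = {
    'Springfield': '3', 'Arlington': '3', 'Columbus': '3',
    'Northbend': '2', 'Hillsdale': '2', 'chi': '2',
    'Riverwood': '1', 'Northbrook': '1', 'ttt': '1',
}
data['big_city'] = data['incident_city'].map(city_rank).fillna('0')
```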
1.10.3.4. Tabulate unique-value counts and label-encode#
column_name = []
unique_value = []
for col in data.select_dtypes(include=object).columns:
    column_name.append(col)
    unique_value.append(data[col].nunique())
df = pd.DataFrame()
df['col_name'] = column_name
df['value'] = unique_value
df = df.sort_values('value', ascending=False)
df
|    | col_name                | value |
|----|-------------------------|-------|
| 11 | incident_location       | 1000  |
| 15 | auto_model              | 39    |
| 5  | insured_hobbies         | 20    |
| 4  | insured_occupation      | 14    |
| 14 | auto_make               | 14    |
| 3  | insured_education_level | 7     |
| 6  | insured_relationship    | 6     |
| 10 | authorities_contacted   | 5     |
| 9  | incident_severity       | 4     |
| 7  | incident_type           | 4     |
| 8  | collision_type          | 4     |
| 13 | police_report_available | 3     |
| 16 | big_state               | 3     |
| 0  | policy_state            | 3     |
| 12 | property_damage         | 3     |
| 1  | policy_csl              | 3     |
| 17 | big_city                | 3     |
| 2  | insured_sex             | 2     |
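One thing the table makes obvious: incident_location has 1000 distinct values, one per row, so label-encoding it produces an identifier-like feature with no generalizable signal. The original pipeline keeps it anyway; a hedged sketch of the alternative, not done here:

```python
# Not part of the original pipeline: incident_location is unique per row,
# so it behaves like a row ID rather than a feature and could be dropped.
data = data.drop('incident_location', axis=1)
```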
temp = pd.DataFrame()
cat_columns = data.select_dtypes(include='O').columns
# Keep a numeric-only copy of the data
float_d = data.copy()
cat_l = list(cat_columns)
for i in cat_l:
    float_d.drop(i, axis=1, inplace=True)
from sklearn.preprocessing import LabelEncoder
# Label-encode every categorical column into temp
for col in cat_columns:
    le = LabelEncoder()
    temp[col] = le.fit_transform(data[col])
# Shared 1-based index so temp and float_d can be merged back together
temp['index'] = range(1, len(temp) + 1)
temp.set_index('index')
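Note that temp.set_index('index') on the last line only displays the re-indexed frame; it does not modify temp, and the later pd.merge(temp, float_d, on='index') relies on 'index' remaining an ordinary column. If the encodings ever need to be reversed for inspection, keeping the fitted encoders works; a small sketch, where the encoders dict is an addition not in the original:

```python
# Variant of the loop above that retains each fitted LabelEncoder.
encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    temp[col] = le.fit_transform(data[col])
    encoders[col] = le

# Map codes back to the original labels, e.g. for insured_hobbies:
# encoders['insured_hobbies'].inverse_transform(temp['insured_hobbies'])
```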
1.10.3.5. Scrape prices for the unique auto_model values and use the price as a feature#
# Simple scraper code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
import time
import datetime
import logging
import random
import openpyxl
import pandas as pd
# Configure logging and the output workbook
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['car', 'price'])
# Hide the "Chrome is being controlled by automated test software" banner
chrome_driver = r'./win/chromedriver'
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
# Load the list of car models to look up
car = pd.read_excel('./车.xlsx')
car = car["car"].tolist()
# Scrape a foreign used-car site (carmax.com)
def foreignWeb(car):
    chrome_driver = r'./win/chromedriver'
    # Fresh incognito browser per query; skip loading images to speed things up
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_argument('User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36')
    browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
    # Search for the given model
    browser.get(f'https://www.carmax.com/cars?search={car}')
    # Locate the element that dismisses the filter pop-up
    xxpe = browser.find_element_by_xpath('//div[contains(text(), "close filters.")]')
    print(xxpe)
    time.sleep(20)

num = 0
for i in car:
    foreignWeb(i)
    num += 1
# Save the data, log how many records were scraped, and quit the browser
wb.save(filename='car_info2.xlsx')
logging.info(f'Scraped {num} records')
browser.quit()
# Scraper for yiche.com (易车网)
def Chineseprocess(car):
    wait = WebDriverWait(browser, 20)
    # Wait for the search box to load
    _input = wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'yccmp-search-input')))
    # Clear the search box, then type the model name
    _input.clear()
    _input.send_keys(car)
    # Locate the search button by class and click it
    browser.find_element_by_xpath("//input[@class='yccmp-search-btn']").click()
    try:
        elem = browser.find_element_by_xpath('//*[@class="pp-car-list"]/ul')
        all_li = elem.find_elements_by_tag_name("li")
        for li in all_li:
            text = li.text
            sheet.append([car, text])
    except Exception:
        # No results found: record an empty price for this model
        sheet.append([car, ""])
    time.sleep(5)

# Open the model-search page, then look up every model in the list
browser.get('https://so.yiche.com/chexing/')
num = 0
for i in car:
    Chineseprocess(i)
    num += 1
# Save the data, log how many records were scraped, and quit the browser
wb.save(filename='car_info.xlsx')
logging.info(f'Scraped {num} records')
browser.quit()
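Once car_info.xlsx exists, the scraped prices can be cleaned and merged back onto data by auto_model, which is what the commented-out pd.merge in the initialization section anticipates. A sketch under assumptions: the car/price column names come from the sheet header above, while the price-extraction regex and the per-model averaging are guesses about the scraped text format; the model_price name matches the column that the normalize helper below deliberately skips.

```python
# Hypothetical post-processing of the scraped sheet (format assumptions noted above).
car_price = pd.read_excel('car_info.xlsx')
# Extract the first number-like token from the raw listing text as the price.
car_price['price'] = (
    car_price['price'].astype(str).str.extract(r'([\d.]+)')[0].astype(float)
)
# One average price per model, then merge onto the combined data.
car_price = car_price.groupby('car', as_index=False)['price'].mean()
car_price = car_price.rename(columns={'car': 'auto_model', 'price': 'model_price'})
data = pd.merge(data, car_price, on='auto_model', how='left')
```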
1.10.4. Normalization from scratch#
def normalize(df):
    # Min-max scale every column except the policy ID and the merged car price
    result = df.copy()
    for feature_name in df.columns:
        if feature_name == 'model_price' or feature_name == 'policy_number':
            continue
        else:
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

float_d = normalize(float_d)
# Add the same 1-based index used for temp, so the two frames can be merged
float_d['index'] = range(1, len(float_d) + 1)
float_d.set_index('index')
| index | months_as_customer | age | policy_number | policy_deductable | policy_annual_premium | umbrella_limit | insured_zip | capital-gains | capital-loss | incident_hour_of_the_day | ... | property_claim | vehicle_claim | auto_year | fraud_reported | _c39 | policy_bind_date_weekday | incident_date_weekday | p_diff | i_diff | pi_diff |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.390397 | 0.400000 | 125591 | 0.333333 | 0.606303 | 0.545455 | 0.109207 | 0.598010 | 1.000000 | 0.913043 | ... | 0.243768 | 0.579821 | 0.25 | 0.0 | NaN | 0.500000 | 0.666667 | 0.938644 | 0.254237 | 0.940601 |
| 2 | 0.507307 | 0.555556 | 967713 | 0.000000 | 0.232788 | 0.090909 | 0.891259 | 0.330348 | 1.000000 | 0.173913 | ... | 0.434305 | 0.451755 | 0.05 | 0.0 | NaN | 0.500000 | 0.166667 | 0.316914 | 0.440678 | 0.318755 |
| 3 | 0.050104 | 0.311111 | 649082 | 0.333333 | 0.922720 | 0.090909 | 0.006146 | 0.000000 | 0.593159 | 0.000000 | ... | 0.395437 | 0.411247 | 0.35 | 0.0 | NaN | 0.666667 | 0.833333 | 0.239974 | 0.389831 | 0.242276 |
| 4 | 0.448852 | 0.511111 | 519312 | 0.000000 | 0.876860 | 0.090909 | 0.028215 | 0.000000 | 0.558956 | 0.869565 | ... | 0.241234 | 0.645616 | 0.40 | 1.0 | NaN | 0.166667 | 0.666667 | 0.748474 | 0.610169 | 0.748477 |
| 5 | 0.177453 | 0.244444 | 190588 | 0.333333 | 0.224883 | 0.090909 | 0.964392 | 0.720398 | 0.306931 | 0.391304 | ... | 0.452894 | 0.538558 | 0.45 | 0.0 | NaN | 1.000000 | 0.666667 | 0.474390 | 0.847458 | 0.473346 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 996 | 0.064718 | 0.377778 | 679370 | 1.000000 | 0.548183 | 0.909091 | 0.899328 | 0.000000 | 0.292529 | 0.391304 | ... | 0.019856 | 0.040508 | 0.35 | NaN | NaN | 1.000000 | 0.666667 | 0.382084 | 0.491525 | 0.383486 |
| 997 | 0.620042 | 0.622222 | 272330 | 0.000000 | 0.733042 | 0.727273 | 0.137584 | 0.000000 | 0.464446 | 0.000000 | ... | 0.234474 | 0.418040 | 0.20 | NaN | NaN | 1.000000 | 0.666667 | 0.791739 | 0.254237 | 0.793951 |
| 998 | 0.524008 | 0.444444 | 315631 | 1.000000 | 0.494747 | 0.090909 | 0.957801 | 0.000000 | 0.714671 | 0.434783 | ... | 0.233629 | 0.416530 | 0.10 | NaN | NaN | 0.666667 | 0.500000 | 0.368134 | 0.118644 | 0.371954 |
| 999 | 0.112735 | 0.355556 | 445195 | 0.000000 | 0.512898 | 0.090909 | 0.121352 | 0.681592 | 0.621062 | 0.913043 | ... | 0.303760 | 0.722732 | 0.85 | NaN | NaN | 0.000000 | 0.333333 | 0.824651 | 0.932203 | 0.822454 |
| 1000 | 0.323591 | 0.333333 | 914815 | 0.000000 | 0.788882 | 0.090909 | 0.169629 | 0.000000 | 1.000000 | 0.043478 | ... | 0.361639 | 0.752925 | 0.15 | NaN | NaN | 0.500000 | 0.333333 | 0.028553 | 0.101695 | 0.033072 |

1000 rows × 25 columns
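For reference, the hand-rolled normalize computes the column-wise min-max formula (x - min) / (max - min), which is exactly what scikit-learn's MinMaxScaler does; since sklearn 0.20 the scaler also ignores NaN entries when fitting, so columns like the all-NaN _c39 simply pass through unchanged. A sketch of the library version as a drop-in for the normalize call (not used in the original):

```python
from sklearn.preprocessing import MinMaxScaler

# Columns that normalize() deliberately leaves unscaled.
skip = ['model_price', 'policy_number']
cols = [c for c in float_d.columns if c not in skip]
float_d[cols] = MinMaxScaler().fit_transform(float_d[cols])
```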
data = pd.merge(temp,float_d,on='index')
1.10.5. Model training#
# from sklearn.model_selection import RandomizedSearchCV
# import lightgbm as lgb
# rs_params = {
#     'colsample_bytree': (0.5, 0.6, 1),
#     'learning_rate': (0.005, 0.1, 0.2, 0.3),
#     'reg_lambda': (0.25, 0.3, 0.5, 3, 5),
#     'max_depth': (-1, 2, 3, 5, 10),
#     'min_child_samples': (1, 3, 5, 9, 10),
#     'num_leaves': (20, 2**5-1, 2**5-1, 300, 400),
#     'reg_alpha': (0.1, 0.25, 0.3, 0.5, 3, 5)
# }
# # Initialize a RandomizedSearchCV object: 7-fold CV, 300 sampled parameter settings
# rs_cv = RandomizedSearchCV(estimator=lgb.LGBMClassifier(), param_distributions=rs_params, cv=7, n_iter=300, verbose=1)
# # Train on the training data
# rs_cv.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'], verbose=1)
# print(rs_cv.best_params_)
# print(rs_cv.best_score_)
# model_lgb = lgb.LGBMClassifier(num_leaves=300,
#                                reg_alpha=5,
#                                reg_lambda=0.5,
#                                objective='binary',
#                                max_depth=3,
#                                learning_rate=0.3,
#                                min_child_samples=5,
#                                random_state=7777, n_estimators=2000,
#                                subsample=1, colsample_bytree=1)
# model_lgb.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'])
LGBMClassifier(colsample_bytree=1, learning_rate=0.3, max_depth=3,
min_child_samples=5, n_estimators=2000, num_leaves=300,
objective='binary', random_state=7777, reg_alpha=5,
reg_lambda=0.5, subsample=1)
# y_pred = model_lgb.predict_proba(test.drop(['fraud_reported'], axis=1))
# Fraction of fraud cases in the training set (class-balance check)
# train['fraud_reported'].mean()
0.25857142857142856
# Average predicted class probabilities across the 300 test rows
# sum(y_pred) / 300
array([0.77069713, 0.22930287])
# result = pd.read_csv('sampleSubmission.csv')
# # Keep the positive-class (fraud) probability column
# result['fraud_reported'] = y_pred[:, 1]
# result
|     | policy_number | fraud_reported |
|-----|---------------|----------------|
| 0   | 698589        | 0.587929       |
| 1   | 287489        | 0.507880       |
| 2   | 211578        | 0.040526       |
| 3   | 807369        | 0.045366       |
| 4   | 830878        | 0.053479       |
| ... | ...           | ...            |
| 295 | 679370        | 0.059729       |
| 296 | 272330        | 0.101335       |
| 297 | 315631        | 0.069951       |
| 298 | 445195        | 0.036823       |
| 299 | 914815        | 0.080392       |

300 rows × 2 columns
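To turn the filled-in frame into a competition submission, it just needs to be written out in the sampleSubmission format; a one-line sketch, where the output file name submission.csv is an assumption:

```python
# Write the predictions out for submission (file name is illustrative).
result.to_csv('submission.csv', index=False)
```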