1.10. 银行欺诈检测比赛#

1.10.1. Initialization#

# 导入包 用于读取文件
import pandas as pd

读数据，合并

# Load the competition's train and test splits. These reads must run: `train`
# and `test` are used by the concat below and by the exploration cells.
train = pd.read_csv('data/bankfraud_train.csv')
test = pd.read_csv('data/bankfraud_test.csv')
# Stack train on top of test so feature engineering is applied to both at once.
data = pd.concat([train, test], axis=0)
# Optional: join an extra feature file (e.g. car prices) on the car model.
# data = pd.merge(data, car_price,on='auto_model', how='outer')

# Shared constants.
# Large US cities (names written without spaces), intended for feature
# engineering.
top_cities = [
    "NewYork", "LosAngeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "SanAntonio", "SanDiego", "Dallas", "SanJose",
    "Austin", "Jacksonville", "FortWorth", "Columbus", "Indianapolis",
    "Charlotte", "SanFrancisco", "Seattle", "Denver", "Washington",
    "Nashville", "OklahomaCity", "ElPaso", "Boston", "Portland",
    "LasVegas", "Detroit", "Memphis", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Sacramento",
    "KansasCity", "Mesa", "Atlanta", "Omaha", "ColoradoSprings",
    "Raleigh", "LongBeach", "VirginiaBeach", "Miami", "Oakland",
    "Minneapolis", "Tulsa", "Bakersfield", "Wichita", "Arlington",
]

1.10.2. view the data#

# Show how many training rows fall in each incident city.
train['incident_city'].value_counts()

Springfield    117
Arlington      110
Columbus       108
Northbend       96
Hillsdale       96
Riverwood       90
Northbrook      83
Name: incident_city, dtype: int64

# Number of distinct auto_model values (length of the value_counts() result).
data['auto_model'].value_counts().count()

# Print the number of distinct values in every object-dtype (non-numeric) column.
object_columns = data.select_dtypes(include=object).columns
for name in object_columns:
    distinct = data[name].nunique()  # nunique is a method, so it must be called
    print(name, distinct)

policy_bind_date 951
policy_state 3
policy_csl 3
insured_sex 2
insured_education_level 7
insured_occupation 14
insured_hobbies 20
insured_relationship 6
incident_date 60
incident_type 4
collision_type 4
incident_severity 4
authorities_contacted 5
incident_state 7
incident_city 7
incident_location 1000
property_damage 3
police_report_available 3
auto_make 14
auto_model 39

1.10.3. Feature engineering#

1.10.3.1. 把日期转换成更有意义的特征，例如星期几#

# Parse both date columns; entries that cannot be parsed become NaT.
date_columns = ['policy_bind_date', 'incident_date']
for name in date_columns:
    data[name] = pd.to_datetime(data[name], errors='coerce')

# Day of the week for each date (0 = Monday ... 6 = Sunday).
for name in date_columns:
    data[name + '_weekday'] = data[name].dt.weekday

# Express each date as a whole-day offset from the earliest policy bind date,
# then discard the raw datetime columns.
base_date = data['policy_bind_date'].min()
for offset_col, name in zip(['p_diff', 'i_diff'], date_columns):
    data[offset_col] = (data[name] - base_date).dt.days
data.drop(date_columns, axis=1, inplace=True)

# Difference between the two offsets: bind-date offset minus incident-date offset.
data['pi_diff'] = data['p_diff'] - data['i_diff']

1.10.3.2. 手动对某个特征做one-hot处理#

# t = pd.read_csv('data/bankfraud_train.csv')
# temp = pd.DataFrame(columns=["months_as_customer"])
# temp = t["months_as_customer"] 
# t = temp
# func1 = lambda x: 1 if x == True else 0
# t["months_as_customer"] = t["months_as_customer"]  > 24
# t["months_as_customer"]  = t["months_as_customer"] .apply(func1)
# t

1.10.3.3. 对某类别数据进行bin处理#

# Tier table for incident states, one column per tier.
# NOTE(review): "chi" and "ttt" look like filler that only pads the shorter
# columns to a common length - confirm.
v = pd.DataFrame(
    [
        ["NY", "NC", "PA"],
        ["SC", "VA", "OH"],
        ["WV", "chi", "ttt"],
    ],
    columns=['top_state', 'second_state', 'third_state'],
)


def _state_tier(state):
    """Return '3', '2' or '1' for a listed state, and '0' for anything else."""
    for label, column in (('3', 'top_state'), ('2', 'second_state'), ('1', 'third_state')):
        if state in v[column].values:
            return label
    return '0'


data['big_state'] = data['incident_state'].apply(_state_tier)

# The raw column is no longer needed once it has been binned.
data.drop('incident_state', axis=1, inplace=True)

# Tier table for incident cities, one column per tier.
# NOTE(review): "chi" and "ttt" look like filler that only pads the shorter
# columns to a common length - confirm.
v = pd.DataFrame(
    [
        ["Springfield", "Northbend", "Riverwood"],
        ["Arlington", "Hillsdale", "Northbrook"],
        ["Columbus", "chi", "ttt"],
    ],
    columns=['top_city', 'second_city', 'third_city'],
)


def _city_tier(city):
    """Return '3', '2' or '1' for a listed city, and '0' for anything else."""
    for label, column in (('3', 'top_city'), ('2', 'second_city'), ('1', 'third_city')):
        if city in v[column].values:
            return label
    return '0'


data['big_city'] = data['incident_city'].apply(_city_tier)

# The raw column can be dropped once it has been binned.
data.drop('incident_city', axis=1, inplace=True)

1.10.3.4. 建立特殊值表，做label encode#

# Build a table of object-dtype columns and their distinct-value counts,
# ordered from the highest cardinality to the lowest.
object_columns = data.select_dtypes(include=object).columns
column_name = list(object_columns)
unique_value = [data[name].nunique() for name in object_columns]

df = pd.DataFrame({'col_name': column_name, 'value': unique_value})
df = df.sort_values('value', ascending=False)
df

	col_name	value
11	incident_location	1000
15	auto_model	39
5	insured_hobbies	20
4	insured_occupation	14
14	auto_make	14
3	insured_education_level	7
6	insured_relationship	6
10	authorities_contacted	5
9	incident_severity	4
7	incident_type	4
8	collision_type	4
13	police_report_available	3
16	big_state	3
0	policy_state	3
12	property_damage	3
1	policy_csl	3
17	big_city	3
2	insured_sex	2

from sklearn.preprocessing import LabelEncoder

# Split the frame: object-dtype columns are label-encoded into `temp`, and
# every remaining column is kept in `float_d`.
cat_columns = data.select_dtypes(include='O').columns
cat_l = list(cat_columns)
float_d = data.drop(columns=cat_l)

temp = pd.DataFrame()
for name in cat_columns:
    # A fresh encoder per column, so each column gets its own integer codes.
    encoder = LabelEncoder()
    temp[name] = encoder.fit_transform(data[name])

# 1-based row id stored as an ordinary column named 'index'.
temp['index'] = range(1, len(temp) + 1)
# set_index is not in-place: this only renders the frame keyed by 'index',
# while `temp` itself keeps the column.
temp.set_index('index')

1.10.3.5. 爬取车型（auto_model）列中各unique取值对应的车价，用车价给每位投保人做一个特征#

# 简单爬虫代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
import time
import datetime
import logging
import random
import openpyxl
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# 配置浏览器
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['car', 'price'])

# 关闭左上方 Chrome 正受到自动测试软件的控制的提示
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
browser = webdriver.Chrome(executable_path=chrome_driver, options=options)

# 导入车数据
car = pd.read_excel('./车.xlsx')
car = car["car"].tolist()

# 爬国外二手车网站
def foreignWeb(car):
    """Open the CarMax search page for ``car`` and print the located element.

    A dedicated incognito Chrome instance is started for the call and is
    always closed again before returning. The function only prints the
    "close filters." element it finds; it does not extract or store a price.

    Args:
        car: Search term (converted with ``str`` and URL-encoded).

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the
            "close filters." element is not present on the page.
    """
    from urllib.parse import quote_plus

    chrome_driver = r'./win/chromedriver'
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    options.add_argument('blink-settings=imagesEnabled=false')  # skip images to load faster
    options.add_argument('User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36')

    browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
    try:
        # Search for the requested car instead of a fixed make.
        browser.get('https://www.carmax.com/cars?search=' + quote_plus(str(car)))

        # find_element(By.XPATH, ...) is used because the find_element_by_*
        # helpers no longer exist in current Selenium releases.
        xxpe = browser.find_element(By.XPATH, '//div[contains(text(), "close filters.")]')
        print(xxpe)
        time.sleep(20)
    finally:
        # Without this, every call left a Chrome process running.
        browser.quit()

# browser.maximize_window()
# Run the CarMax lookup once per car name, counting the completed calls.
num = 0
for i in car:
    foreignWeb(i)
    num += 1

# Save the workbook, log how many cars were processed, and close the browser.
# NOTE(review): browser.quit() closes the module-level browser, and the Yiche
# section further down calls browser.get() on that same name - confirm the two
# scraping sections are meant to be run independently.
wb.save(filename='car_info2.xlsx')
logging.info(f'共获取{num}条信息')
browser.quit()

# 易车网的爬虫代码
def Chineseprocess(car):
    wait = WebDriverWait(browser, 20)
    _input = wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'yccmp-search-input')))
    # # 搜索框中输入内容，输入之前先清空
    # _input.clear()
    # _input.send_keys('Forrestor')
    # # class定位   模拟点击搜文章
    # browser.find_element_by_xpath("//input[@class='yccmp-search-btn']").click()
    # time.sleep(10)

    _input.clear()
    _input.send_keys(car)
    # class定位   模拟点击搜文章
    browser.find_element_by_xpath("//input[@class='yccmp-search-btn']").click()
    
    try:
        elem = browser.find_element_by_xpath('//*[@class="pp-car-list"]/ul')
        all_li = elem.find_elements_by_tag_name("li")
        for li in all_li:
            text = li.text
            sheet.append([car, text])
    except Exception:
        sheet.append([car,""])
    
    time.sleep(5)

# Open the Yiche model-search page with the module-level browser.
browser.get('https://so.yiche.com/chexing/')
# browser.maximize_window()
# Look up every car name in turn, counting the completed calls.
num = 0
for i in car:
    Chineseprocess(i)
    num += 1

# Save the collected rows, log how many cars were processed, close the browser.
wb.save(filename='car_info.xlsx')
logging.info(f'共获取{num}条信息')
browser.quit()

1.10.4. 标准化处理的scratch#

def normalize(df, skip=('model_price', 'policy_number')):
    """Min-max scale the columns of ``df`` into the range [0, 1].

    Args:
        df: DataFrame whose columns (other than those in ``skip``) support
            ``min``, ``max`` and subtraction.
        skip: Names of columns that are copied through unchanged. The default
            keeps the two columns that were previously hard-coded.

    Returns:
        A new DataFrame with the scaled columns; ``df`` itself is not modified.
        Missing values stay missing. A column whose values are all equal is
        mapped to 0.0 rather than to NaN.
    """
    result = df.copy()
    skipped = set(skip)
    for feature_name in df.columns:
        if feature_name in skipped:
            continue
        column = df[feature_name]
        min_value = column.min()
        span = column.max() - min_value
        if span == 0:
            # Constant column: dividing would be 0/0 and turn it into NaN.
            # Every non-missing value equals the minimum, so the shifted
            # column is already all zeros.
            result[feature_name] = (column - min_value).astype(float)
        else:
            result[feature_name] = (column - min_value) / span
    return result

# Min-max scale the numeric frame, then add a 1-based row id as an ordinary
# column named 'index'.
float_d = normalize(float_d)
float_d['index'] = range(1, len(float_d) + 1)
# set_index is not in-place: this only renders the frame keyed by 'index',
# while `float_d` itself keeps the column.
float_d.set_index('index')

	months_as_customer	age	policy_number	policy_deductable	policy_annual_premium	umbrella_limit	insured_zip	capital-gains	capital-loss	incident_hour_of_the_day	...	property_claim	vehicle_claim	auto_year	fraud_reported	_c39	policy_bind_date_weekday	incident_date_weekday	p_diff	i_diff	pi_diff
index
1	0.390397	0.400000	125591	0.333333	0.606303	0.545455	0.109207	0.598010	1.000000	0.913043	...	0.243768	0.579821	0.25	0.0	NaN	0.500000	0.666667	0.938644	0.254237	0.940601
2	0.507307	0.555556	967713	0.000000	0.232788	0.090909	0.891259	0.330348	1.000000	0.173913	...	0.434305	0.451755	0.05	0.0	NaN	0.500000	0.166667	0.316914	0.440678	0.318755
3	0.050104	0.311111	649082	0.333333	0.922720	0.090909	0.006146	0.000000	0.593159	0.000000	...	0.395437	0.411247	0.35	0.0	NaN	0.666667	0.833333	0.239974	0.389831	0.242276
4	0.448852	0.511111	519312	0.000000	0.876860	0.090909	0.028215	0.000000	0.558956	0.869565	...	0.241234	0.645616	0.40	1.0	NaN	0.166667	0.666667	0.748474	0.610169	0.748477
5	0.177453	0.244444	190588	0.333333	0.224883	0.090909	0.964392	0.720398	0.306931	0.391304	...	0.452894	0.538558	0.45	0.0	NaN	1.000000	0.666667	0.474390	0.847458	0.473346
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
996	0.064718	0.377778	679370	1.000000	0.548183	0.909091	0.899328	0.000000	0.292529	0.391304	...	0.019856	0.040508	0.35	NaN	NaN	1.000000	0.666667	0.382084	0.491525	0.383486
997	0.620042	0.622222	272330	0.000000	0.733042	0.727273	0.137584	0.000000	0.464446	0.000000	...	0.234474	0.418040	0.20	NaN	NaN	1.000000	0.666667	0.791739	0.254237	0.793951
998	0.524008	0.444444	315631	1.000000	0.494747	0.090909	0.957801	0.000000	0.714671	0.434783	...	0.233629	0.416530	0.10	NaN	NaN	0.666667	0.500000	0.368134	0.118644	0.371954
999	0.112735	0.355556	445195	0.000000	0.512898	0.090909	0.121352	0.681592	0.621062	0.913043	...	0.303760	0.722732	0.85	NaN	NaN	0.000000	0.333333	0.824651	0.932203	0.822454
1000	0.323591	0.333333	914815	0.000000	0.788882	0.090909	0.169629	0.000000	1.000000	0.043478	...	0.361639	0.752925	0.15	NaN	NaN	0.500000	0.333333	0.028553	0.101695	0.033072

1000 rows × 25 columns

# Rejoin the label-encoded columns and the scaled numeric columns on the
# shared 1-based 'index' column.
data = temp.merge(float_d, on='index')

1.10.5. 训练模型#

# from sklearn.model_selection import RandomizedSearchCV
# import lightgbm as lgb

# rs_params = {

#         'colsample_bytree': (0.5, 0.6, 1),
#         'learning_rate': (0.005, 0.1, 0.2, 0.3),
#         'reg_lambda': (0.25, 0.3, 0.5, 3, 5),
#         'max_depth': (-1, 2, 3, 5, 10),
#         'min_child_samples': (1, 3, 5, 9, 10),
#         'num_leaves': (20, 2**5-1, 2**5-1, 300, 400),
#         'reg_alpha' : (0.1, 0.25, 0.3, 0.5, 3, 5)
    
# }

# # Initialize a RandomizedSearchCV object using 7-fold CV,
# # sampling 300 parameter combinations (n_iter=300)
# rs_cv = RandomizedSearchCV(estimator=lgb.LGBMClassifier(), param_distributions=rs_params, cv = 7, n_iter=300,verbose=1)

# # Train on training data
# rs_cv.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'],verbose=1)
# print(rs_cv.best_params_)
# print(rs_cv.best_score_)

# model_lgb = lgb.LGBMClassifier(num_leaves = 300, 
#                                reg_alpha=5, 
#                                reg_lambda=0.5, 
#                                objective='binary', 
#                                max_depth=3, 
#                                learning_rate=0.3, 
#                                min_child_samples=5, 
#                                random_state=7777,n_estimators=2000,subsample=1, colsample_bytree=1,)

# model_lgb.fit(train.drop(['fraud_reported'], axis=1), train['fraud_reported'])

LGBMClassifier(colsample_bytree=1, learning_rate=0.3, max_depth=3,
               min_child_samples=5, n_estimators=2000, num_leaves=300,
               objective='binary', random_state=7777, reg_alpha=5,
               reg_lambda=0.5, subsample=1)

# y_pred = model_lgb.predict_proba(test.drop(['fraud_reported'], axis=1))

# train['fraud_reported'].mean()

0.25857142857142856

# sum(y_pred) / 300

array([0.77069713, 0.22930287])

# result = pd.read_csv('sampleSubmission.csv')
# # 调整矩阵形状
# result['fraud_reported'] = y_pred[:, 1]

# result

	policy_number	fraud_reported
0	698589	0.587929
1	287489	0.507880
2	211578	0.040526
3	807369	0.045366
4	830878	0.053479
...	...	...
295	679370	0.059729
296	272330	0.101335
297	315631	0.069951
298	445195	0.036823
299	914815	0.080392

300 rows × 2 columns

Chenoi AI Lab

银行欺诈检测比赛

Contents

1.10. 银行欺诈检测比赛#

1.10.1. Initialization#

1.10.2. view the data#

1.10.3. Feature engineering#

1.10.3.1. 把日期转换成更有意义的特征，列入星期几#

1.10.3.2. 手动对某个特征做one-hot处理#

1.10.3.3. 对某类别数据进行bin处理#

1.10.3.4. 建立特殊值表，做label encode#

1.10.3.5. 爬虫车牌列unique数据的车价，用车价给各位欠款人做个特征#

1.10.4. 标准化处理的scratch#

1.10.5. 训练模型#

Chenoi AI Lab

银行欺诈检测比赛

Contents

1.10. 银行欺诈检测比赛#

1.10.1. Initialization#

1.10.2. view the data#

1.10.3. Feature engineering#

1.10.3.1. 把日期转换成更有意义的特征，列入星期几#

1.10.3.2. 手动对某个特征做one-hot处理#

1.10.3.3. 对某类别数据进行bin处理#

1.10.3.4. 建立特殊值表， 做label encode#

1.10.3.5. 爬虫车牌列unique数据的车价，用车价给各位欠款人做个特征#

1.10.4. 标准化处理的scratch#

1.10.5. 训练模型#

1.10.3.4. 建立特殊值表，做label encode#