{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "(bank_AI)=\n", "# 银行欺诈检测比赛" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "PDF请访问[银行欺诈检测.pdf](https://www.dropbox.com/s/n2jt4tkuvnlkomg/L1%20AI%E5%A4%A7%E8%B5%9B%E4%BF%A1%E6%81%AF%E4%BB%8B%E7%BB%8DV1.5.pptx?dl=0)。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Initialization " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# 导入包 用于读取文件\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "读数据,合并" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# train = pd.read_csv('data/bankfraud_train.csv')\n", "# test = pd.read_csv('data/bankfraud_test.csv')\n", "# 合并train 和 test 集\n", "data = pd.concat([train, test], axis=0)\n", "# 用于其他features文件的合并\n", "# data = pd.merge(data, car_price,on='auto_model', how='outer')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# 声明一些变量\n", "# 美国大城市,用于feature engineering\n", "top_cities = [\"NewYork\",\"LosAngeles\",\"Chicago\",\"Houston\",\"Phoenix\",\"Philadelphia\",\"SanAntonio\",\"SanDiego\",\"Dallas\",\"SanJose\",\"Austin\",\"Jacksonville\",\"FortWorth\",\"Columbus\",\"Indianapolis\",\"Charlotte\",\"SanFrancisco\",\"Seattle\",\"Denver\",\"Washington\",\"Nashville\",\"OklahomaCity\",\"ElPaso\",\"Boston\",\"Portland\",\"LasVegas\",\"Detroit\",\"Memphis\",\"Louisville\",\"Baltimore\",\"Milwaukee\",\"Albuquerque\",\"Tucson\",\"Fresno\",\"Sacramento\",\"KansasCity\",\"Mesa\",\"Atlanta\",\"Omaha\",\"ColoradoSprings\",\"Raleigh\",\"LongBeach\",\"VirginiaBeach\",\"Miami\",\"Oakland\",\"Minneapolis\",\"Tulsa\",\"Bakersfield\",\"Wichita\",\"Arlington\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## view the data" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Springfield 117\n", "Arlington 110\n", "Columbus 108\n", "Northbend 96\n", "Hillsdale 96\n", "Riverwood 90\n", "Northbrook 83\n", "Name: incident_city, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train['incident_city'].value_counts()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "39" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['auto_model'].value_counts().count()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "policy_bind_date 951\n", "policy_state 3\n", "policy_csl 3\n", "insured_sex 2\n", "insured_education_level 7\n", "insured_occupation 14\n", "insured_hobbies 20\n", "insured_relationship 6\n", "incident_date 60\n", "incident_type 4\n", "collision_type 4\n", "incident_severity 4\n", "authorities_contacted 5\n", "incident_state 7\n", "incident_city 7\n", "incident_location 1000\n", "property_damage 3\n", "police_report_available 3\n", "auto_make 14\n", "auto_model 39\n" ] } ], "source": [ "# 用于查看数据中非数字列特殊值的个数\n", "for col in data.select_dtypes(include=object).columns:\n", " # nqunique是一个方法,记得加括号\n", "\tprint(col, data[col].nunique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature engineering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 把日期转换成更有意义的特征,列入星期几" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], errors='coerce')\n", "data['incident_date'] = pd.to_datetime(data['incident_date'], errors='coerce')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "data['policy_bind_date_weekday'] = data['policy_bind_date'].dt.weekday\n", "data['incident_date_weekday'] = data['incident_date'].dt.weekday" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "base_date = data['policy_bind_date'].min()\n", "data['p_diff'] = (data['policy_bind_date'] - base_date).dt.days\n", "data['i_diff'] = (data['incident_date'] - base_date).dt.days\n", "data.drop(['policy_bind_date', 'incident_date'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# 日期求差值\n", "data['pi_diff'] = data['p_diff'] - data['i_diff']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 手动对某个特征做one-hot处理\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# t = pd.read_csv('data/bankfraud_train.csv')\n", "# temp = pd.DataFrame(columns=[\"months_as_customer\"])\n", "# temp = t[\"months_as_customer\"] \n", "# t = temp\n", "# func1 = lambda x: 1 if x == True else 0\n", "# t[\"months_as_customer\"] = t[\"months_as_customer\"] > 24\n", "# t[\"months_as_customer\"] = t[\"months_as_customer\"] .apply(func1)\n", "# t" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 对某类别数据进行bin处理" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "v = pd.DataFrame({\n", " 'top_state': [\"NY\", \"SC\", \"WV\"],\n", " 'second_state': [\"NC\", \"VA\", \"chi\"], \n", " 'third_state' : [\"PA\", \"OH\", \"ttt\"]\n", "})\n", "\n", "data['big_state'] = data['incident_state'].apply(lambda x: '3' if x in v['top_state'].values else '2' if x in v['second_state'].values \n", " else '1' if x in v['third_state'].values else '0')" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data.drop('incident_state', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "v = pd.DataFrame({\n", " 'top_city': [\"Springfield\", \"Arlington\", \"Columbus\"],\n", " 'second_city': [\"Northbend\", \"Hillsdale\", \"chi\"], \n", " 'third_city' : [\"Riverwood\", \"Northbrook\", \"ttt\"]\n", "})\n", "\n", "data['big_city'] = data['incident_city'].apply(lambda x: '3' if x in v['top_city'].values else '2' if x in v['second_city'].values \n", " else '1' if x in v['third_city'].values else '0')\n", "\n", "# 处理好以后可以删了\n", "data.drop('incident_city', axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 建立特殊值表, 做label encode" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "column_name = []\n", "unique_value = []\n", "\n", "for col in data.select_dtypes(include=object).columns:\n", "\tcolumn_name.append(col)\n", "\tunique_value.append(data[col].nunique())" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | col_name | \n", "value | \n", "
---|---|---|
11 | \n", "incident_location | \n", "1000 | \n", "
15 | \n", "auto_model | \n", "39 | \n", "
5 | \n", "insured_hobbies | \n", "20 | \n", "
4 | \n", "insured_occupation | \n", "14 | \n", "
14 | \n", "auto_make | \n", "14 | \n", "
3 | \n", "insured_education_level | \n", "7 | \n", "
6 | \n", "insured_relationship | \n", "6 | \n", "
10 | \n", "authorities_contacted | \n", "5 | \n", "
9 | \n", "incident_severity | \n", "4 | \n", "
7 | \n", "incident_type | \n", "4 | \n", "
8 | \n", "collision_type | \n", "4 | \n", "
13 | \n", "police_report_available | \n", "3 | \n", "
16 | \n", "big_state | \n", "3 | \n", "
0 | \n", "policy_state | \n", "3 | \n", "
12 | \n", "property_damage | \n", "3 | \n", "
1 | \n", "policy_csl | \n", "3 | \n", "
17 | \n", "big_city | \n", "3 | \n", "
2 | \n", "insured_sex | \n", "2 | \n", "
\n", " | policy_state | \n", "policy_csl | \n", "insured_sex | \n", "insured_education_level | \n", "insured_occupation | \n", "insured_hobbies | \n", "insured_relationship | \n", "incident_type | \n", "collision_type | \n", "incident_severity | \n", "authorities_contacted | \n", "incident_location | \n", "property_damage | \n", "police_report_available | \n", "auto_make | \n", "auto_model | \n", "big_state | \n", "big_city | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "1 | \n", "2 | \n", "0 | \n", "5 | \n", "10 | \n", "15 | \n", "1 | \n", "2 | \n", "3 | \n", "2 | \n", "0 | \n", "680 | \n", "0 | \n", "0 | \n", "9 | \n", "26 | \n", "1 | \n", "0 | \n", "
2 | \n", "0 | \n", "1 | \n", "1 | \n", "3 | \n", "2 | \n", "14 | \n", "2 | \n", "0 | \n", "3 | \n", "1 | \n", "3 | \n", "935 | \n", "0 | \n", "2 | \n", "6 | \n", "10 | \n", "1 | \n", "2 | \n", "
3 | \n", "0 | \n", "2 | \n", "0 | \n", "2 | \n", "6 | \n", "16 | \n", "5 | \n", "2 | \n", "3 | \n", "2 | \n", "4 | \n", "456 | \n", "0 | \n", "1 | \n", "7 | \n", "36 | \n", "2 | \n", "1 | \n", "
4 | \n", "2 | \n", "2 | \n", "1 | \n", "3 | \n", "13 | \n", "18 | \n", "3 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "157 | \n", "2 | \n", "2 | \n", "11 | \n", "21 | \n", "2 | \n", "1 | \n", "
5 | \n", "2 | \n", "0 | \n", "0 | \n", "4 | \n", "2 | \n", "18 | \n", "3 | \n", "0 | \n", "2 | \n", "2 | \n", "1 | \n", "141 | \n", "2 | \n", "2 | \n", "5 | \n", "14 | \n", "2 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
996 | \n", "0 | \n", "2 | \n", "0 | \n", "1 | \n", "9 | \n", "11 | \n", "1 | \n", "1 | \n", "0 | \n", "3 | \n", "2 | \n", "503 | \n", "1 | \n", "2 | \n", "4 | \n", "27 | \n", "2 | \n", "2 | \n", "
997 | \n", "1 | \n", "1 | \n", "1 | \n", "4 | \n", "0 | \n", "12 | \n", "4 | \n", "0 | \n", "3 | \n", "2 | \n", "1 | \n", "3 | \n", "0 | \n", "2 | \n", "7 | \n", "17 | \n", "2 | \n", "2 | \n", "
998 | \n", "1 | \n", "2 | \n", "0 | \n", "0 | \n", "7 | \n", "10 | \n", "1 | \n", "2 | \n", "2 | \n", "1 | \n", "4 | \n", "40 | \n", "2 | \n", "0 | \n", "4 | \n", "30 | \n", "2 | \n", "1 | \n", "
999 | \n", "1 | \n", "0 | \n", "0 | \n", "4 | \n", "10 | \n", "10 | \n", "4 | \n", "0 | \n", "3 | \n", "1 | \n", "1 | \n", "452 | \n", "2 | \n", "0 | \n", "9 | \n", "26 | \n", "1 | \n", "2 | \n", "
1000 | \n", "1 | \n", "0 | \n", "1 | \n", "5 | \n", "10 | \n", "7 | \n", "2 | \n", "2 | \n", "1 | \n", "1 | \n", "0 | \n", "966 | \n", "1 | \n", "2 | \n", "6 | \n", "10 | \n", "0 | \n", "1 | \n", "
1000 rows × 18 columns
\n", "\n", " | months_as_customer | \n", "age | \n", "policy_number | \n", "policy_deductable | \n", "policy_annual_premium | \n", "umbrella_limit | \n", "insured_zip | \n", "capital-gains | \n", "capital-loss | \n", "incident_hour_of_the_day | \n", "... | \n", "property_claim | \n", "vehicle_claim | \n", "auto_year | \n", "fraud_reported | \n", "_c39 | \n", "policy_bind_date_weekday | \n", "incident_date_weekday | \n", "p_diff | \n", "i_diff | \n", "pi_diff | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "0.390397 | \n", "0.400000 | \n", "125591 | \n", "0.333333 | \n", "0.606303 | \n", "0.545455 | \n", "0.109207 | \n", "0.598010 | \n", "1.000000 | \n", "0.913043 | \n", "... | \n", "0.243768 | \n", "0.579821 | \n", "0.25 | \n", "0.0 | \n", "NaN | \n", "0.500000 | \n", "0.666667 | \n", "0.938644 | \n", "0.254237 | \n", "0.940601 | \n", "
2 | \n", "0.507307 | \n", "0.555556 | \n", "967713 | \n", "0.000000 | \n", "0.232788 | \n", "0.090909 | \n", "0.891259 | \n", "0.330348 | \n", "1.000000 | \n", "0.173913 | \n", "... | \n", "0.434305 | \n", "0.451755 | \n", "0.05 | \n", "0.0 | \n", "NaN | \n", "0.500000 | \n", "0.166667 | \n", "0.316914 | \n", "0.440678 | \n", "0.318755 | \n", "
3 | \n", "0.050104 | \n", "0.311111 | \n", "649082 | \n", "0.333333 | \n", "0.922720 | \n", "0.090909 | \n", "0.006146 | \n", "0.000000 | \n", "0.593159 | \n", "0.000000 | \n", "... | \n", "0.395437 | \n", "0.411247 | \n", "0.35 | \n", "0.0 | \n", "NaN | \n", "0.666667 | \n", "0.833333 | \n", "0.239974 | \n", "0.389831 | \n", "0.242276 | \n", "
4 | \n", "0.448852 | \n", "0.511111 | \n", "519312 | \n", "0.000000 | \n", "0.876860 | \n", "0.090909 | \n", "0.028215 | \n", "0.000000 | \n", "0.558956 | \n", "0.869565 | \n", "... | \n", "0.241234 | \n", "0.645616 | \n", "0.40 | \n", "1.0 | \n", "NaN | \n", "0.166667 | \n", "0.666667 | \n", "0.748474 | \n", "0.610169 | \n", "0.748477 | \n", "
5 | \n", "0.177453 | \n", "0.244444 | \n", "190588 | \n", "0.333333 | \n", "0.224883 | \n", "0.090909 | \n", "0.964392 | \n", "0.720398 | \n", "0.306931 | \n", "0.391304 | \n", "... | \n", "0.452894 | \n", "0.538558 | \n", "0.45 | \n", "0.0 | \n", "NaN | \n", "1.000000 | \n", "0.666667 | \n", "0.474390 | \n", "0.847458 | \n", "0.473346 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
996 | \n", "0.064718 | \n", "0.377778 | \n", "679370 | \n", "1.000000 | \n", "0.548183 | \n", "0.909091 | \n", "0.899328 | \n", "0.000000 | \n", "0.292529 | \n", "0.391304 | \n", "... | \n", "0.019856 | \n", "0.040508 | \n", "0.35 | \n", "NaN | \n", "NaN | \n", "1.000000 | \n", "0.666667 | \n", "0.382084 | \n", "0.491525 | \n", "0.383486 | \n", "
997 | \n", "0.620042 | \n", "0.622222 | \n", "272330 | \n", "0.000000 | \n", "0.733042 | \n", "0.727273 | \n", "0.137584 | \n", "0.000000 | \n", "0.464446 | \n", "0.000000 | \n", "... | \n", "0.234474 | \n", "0.418040 | \n", "0.20 | \n", "NaN | \n", "NaN | \n", "1.000000 | \n", "0.666667 | \n", "0.791739 | \n", "0.254237 | \n", "0.793951 | \n", "
998 | \n", "0.524008 | \n", "0.444444 | \n", "315631 | \n", "1.000000 | \n", "0.494747 | \n", "0.090909 | \n", "0.957801 | \n", "0.000000 | \n", "0.714671 | \n", "0.434783 | \n", "... | \n", "0.233629 | \n", "0.416530 | \n", "0.10 | \n", "NaN | \n", "NaN | \n", "0.666667 | \n", "0.500000 | \n", "0.368134 | \n", "0.118644 | \n", "0.371954 | \n", "
999 | \n", "0.112735 | \n", "0.355556 | \n", "445195 | \n", "0.000000 | \n", "0.512898 | \n", "0.090909 | \n", "0.121352 | \n", "0.681592 | \n", "0.621062 | \n", "0.913043 | \n", "... | \n", "0.303760 | \n", "0.722732 | \n", "0.85 | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.333333 | \n", "0.824651 | \n", "0.932203 | \n", "0.822454 | \n", "
1000 | \n", "0.323591 | \n", "0.333333 | \n", "914815 | \n", "0.000000 | \n", "0.788882 | \n", "0.090909 | \n", "0.169629 | \n", "0.000000 | \n", "1.000000 | \n", "0.043478 | \n", "... | \n", "0.361639 | \n", "0.752925 | \n", "0.15 | \n", "NaN | \n", "NaN | \n", "0.500000 | \n", "0.333333 | \n", "0.028553 | \n", "0.101695 | \n", "0.033072 | \n", "
1000 rows × 25 columns
\n", "\n", " | policy_number | \n", "fraud_reported | \n", "
---|---|---|
0 | \n", "698589 | \n", "0.587929 | \n", "
1 | \n", "287489 | \n", "0.507880 | \n", "
2 | \n", "211578 | \n", "0.040526 | \n", "
3 | \n", "807369 | \n", "0.045366 | \n", "
4 | \n", "830878 | \n", "0.053479 | \n", "
... | \n", "... | \n", "... | \n", "
295 | \n", "679370 | \n", "0.059729 | \n", "
296 | \n", "272330 | \n", "0.101335 | \n", "
297 | \n", "315631 | \n", "0.069951 | \n", "
298 | \n", "445195 | \n", "0.036823 | \n", "
299 | \n", "914815 | \n", "0.080392 | \n", "
300 rows × 2 columns
\n", "