Taobao User Behavior Analysis


Dataset Overview

This dataset contains every recorded behavior (clicks, purchases, add-to-cart, and favorites) of roughly one million randomly sampled users with activity between November 25 and December 3, 2017.
It is organized like MovieLens-20M: each row is one user behavior record consisting of a user ID, item ID, item category ID, behavior type, and timestamp, separated by commas.
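
Since the raw CSV has no header row, column names must be supplied at load time. A minimal sketch for peeking at the data (the column names are this write-up's own convention, not part of the dataset):

import pandas as pd

# Sketch: the raw CSV is headerless, so name the columns explicitly
# (names follow this write-up's convention)
cols = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior_type', 'Timestamp']
pd.read_csv('UserBehavior.csv', names=cols, nrows=5)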

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# SimHei covers CJK glyphs; keep the minus sign renderable
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

Data Preprocessing

# Load the raw data
data = pd.read_csv(r'UserBehavior.csv')
data.shape
(100150806, 5)
# The full dataset is too large to work with comfortably in local memory,
# so only the first 10,000,000 rows are kept
data.head(10000000).to_csv('10000000_data.csv', index=False, header=['User_Id','Item_Id','Category_Id','Behavior_type','Timestamp'])
# Free the memory held by the full dataset
del data 
# Reload the 10M-row slice
df = pd.read_csv(r'10000000_data.csv')
df.head()
User_Id Item_Id Category_Id Behavior_type Timestamp
0 1 2333346 2520771 pv 1511561733
1 1 2576651 149192 pv 1511572885
2 1 3830808 4181361 pv 1511593493
3 1 4365585 2520377 pv 1511596146
4 1 4606018 2735466 pv 1511616481
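
If even the 10,000,000-row slice strains memory, pandas can stream the raw file instead of loading it whole; a rough sketch (the 1,000,000-row chunk size is arbitrary):

# Sketch: process the raw file chunk by chunk, e.g. counting page views
reader = pd.read_csv('UserBehavior.csv', chunksize=1_000_000,
                     names=['User_Id', 'Item_Id', 'Category_Id', 'Behavior_type', 'Timestamp'])
n_pv = sum((chunk.Behavior_type == 'pv').sum() for chunk in reader)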

Data Cleaning

# Check for missing values
df.isnull().sum()
User_Id          0
Item_Id          0
Category_Id      0
Behavior_type    0
Timestamp        0
dtype: int64
# Check for out-of-range values
print(df[df.Timestamp<0].count())
print('-' * 50)
print(df[df.Category_Id<0].count())
print('-' * 50)
print(df[df.Item_Id<0].count())
print('-' * 50)
print(df[df.User_Id<0].count())
print('-' * 50)
print(df.Behavior_type.value_counts())
User_Id          29
Item_Id          29
Category_Id      29
Behavior_type    29
Timestamp        29
dtype: int64
--------------------------------------------------
User_Id          0
Item_Id          0
Category_Id      0
Behavior_type    0
Timestamp        0
dtype: int64
--------------------------------------------------
User_Id          0
Item_Id          0
Category_Id      0
Behavior_type    0
Timestamp        0
dtype: int64
--------------------------------------------------
User_Id          0
Item_Id          0
Category_Id      0
Behavior_type    0
Timestamp        0
dtype: int64
--------------------------------------------------
pv      8950064
cart     559132
fav      291657
buy      199147
Name: Behavior_type, dtype: int64
# Timestamp contains a few negative values; since the dataset is large and
# the anomalies are rare, they are simply dropped
df = df[df.Timestamp > 0]
# Convert the Unix timestamp into separate date and time columns
import time
df['dates'] = df.loc[:, 'Timestamp'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x)))
df.loc[:, 'Date'] = df['dates'].apply(lambda x: x.split(' ')[0])
df.loc[:, 'Time'] = df['dates'].apply(lambda x: x.split(' ')[1])
df = df.drop(labels=['Timestamp', 'dates'], axis=1)  # drop the helper columns
df.head()
User_Id Item_Id Category_Id Behavior_type Date Time
0 1 2333346 2520771 pv 2017-11-25 06:15:33
1 1 2576651 149192 pv 2017-11-25 09:21:25
2 1 3830808 4181361 pv 2017-11-25 15:04:53
3 1 4365585 2520377 pv 2017-11-25 15:49:06
4 1 4606018 2735466 pv 2017-11-25 21:28:01
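
The row-wise apply above is slow on ten million rows; pd.to_datetime can do the same conversion vectorized. A sketch (unit='s' parses Unix seconds as UTC, so convert to the local zone to mirror time.localtime; 'Asia/Shanghai' is assumed here):

# Sketch: vectorized replacement for the per-row strftime apply
dt = pd.to_datetime(df['Timestamp'], unit='s', utc=True).dt.tz_convert('Asia/Shanghai')
df['Date'] = dt.dt.strftime('%Y-%m-%d')
df['Time'] = dt.dt.strftime('%H:%M:%S')
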
# Keep only records dated 2017-11-25 through 2017-12-03
cleaned_data=df[(df["Date"]>='2017-11-25')&(df["Date"]<='2017-12-03')]
print(cleaned_data["Date"].value_counts())
2017-12-02    1385240
2017-12-03    1369603
2017-12-01    1086983
2017-11-26    1063230
2017-11-30    1038274
2017-11-25    1038049
2017-11-29    1020800
2017-11-27    1004379
2017-11-28     987888
Name: Date, dtype: int64
# Drop duplicate rows
print(cleaned_data.duplicated().value_counts())
cleaned_data = cleaned_data.drop_duplicates()
print('-'* 50)
print(cleaned_data.duplicated().value_counts())
False    9994441
True           5
dtype: int64
--------------------------------------------------
False    9994441
dtype: int64
# Save the preprocessed data
# cleaned_data.to_csv(r'cleaned_data.csv', encoding = 'utf-8', index = False)
# Free memory
# del df

Data Analysis

# Load the cleaned data
data = pd.read_csv(r'cleaned_data.csv')
data.head()
User_Id Item_Id Category_Id Behavior_type Date Time
0 1 2333346 2520771 pv 2017-11-25 06:15:33
1 1 2576651 149192 pv 2017-11-25 09:21:25
2 1 3830808 4181361 pv 2017-11-25 15:04:53
3 1 4365585 2520377 pv 2017-11-25 15:49:06
4 1 4606018 2735466 pv 2017-11-25 21:28:01
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994441 entries, 0 to 9994440
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   User_Id        int64 
 1   Item_Id        int64 
 2   Category_Id    int64 
 3   Behavior_type  object
 4   Date           object
 5   Time           object
dtypes: int64(3), object(3)
memory usage: 457.5+ MB
## Basic dataset profile
describe1 = pd.DataFrame({"Users": data["User_Id"].nunique(), "Items": data["Item_Id"].nunique(), "Categories": data["Category_Id"].nunique(), "Days": data["Date"].nunique()}, index=['1'])
describe1
Users Items Categories Days
1 97810 1560525 7966 9

Building the metric system with the "People-Goods-Place" (人货场) breakdown

User metrics (People)

Per-user behavior counts

describe = data.loc[:, ["User_Id", "Behavior_type"]]
# Base frame indexed by user, used as the join anchor below
ids = pd.DataFrame(np.zeros(data["User_Id"].nunique()), index=data["User_Id"].unique())
pv_class = describe[describe["Behavior_type"] == "pv"].groupby("User_Id").count()
pv_class.columns = ["pv"]
buy_class = describe[describe["Behavior_type"] == "buy"].groupby("User_Id").count()
buy_class.columns = ["buy"]
fav_class = describe[describe["Behavior_type"] == "fav"].groupby("User_Id").count()
fav_class.columns = ["fav"]
cart_class = describe[describe["Behavior_type"] == "cart"].groupby("User_Id").count()
cart_class.columns = ["cart"]

user_behavior_counts = ids.join(pv_class).join(fav_class).join(cart_class).join(buy_class).iloc[:, 1:]
user_behavior_counts.head()
pv fav cart buy
1 54.0 NaN NaN NaN
5 64.0 NaN NaN NaN
262150 19.0 NaN NaN NaN
524294 62.0 11.0 8.0 2.0
524295 110.0 4.0 2.0 3.0
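
The placeholder-and-join construction above can be collapsed into a single pd.crosstab call, which also yields 0 instead of NaN for missing behaviors; a sketch:

# Sketch: one-line equivalent of the join chain above
user_behavior_counts2 = pd.crosstab(data['User_Id'], data['Behavior_type'])
user_behavior_counts2.head()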

Daily trend of total page views and purchases

## data["Behavior_type"].value_counts()
# pv      8944510
# cart     559132
# fav      291657
# buy      199142

# Page views per day
data_pv = data[data["Behavior_type"]=="pv"][["Behavior_type","Date"]]
data_pv_date=data_pv.groupby(by="Date").count()
# print("Page views per day","\n",data_pv_date)
# Purchases per day
data_buy = data[data["Behavior_type"]=="buy"][["Behavior_type","Date"]]
data_buy_date=data_buy.groupby(by="Date").count()
# print("Purchases per day","\n",data_buy_date)

# Plot
plt.figure(figsize=(20, 6), dpi=70)
plt.suptitle("Daily page views and purchases")
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)
plt.sca(ax1)
plt.plot(data_pv_date.index, data_pv_date.values, label="pv", color="green", linewidth=2)
plt.title("Total page views per day")
plt.xlabel("Date")
plt.ylabel("Times")

plt.sca(ax2)
plt.title("Total purchases per day")

plt.plot(data_buy_date.index, data_buy_date.values, label="buy", linewidth=2)
plt.xlabel("Date")
plt.ylabel("Times")
plt.show()

[Figure: line charts of total daily page views (left) and purchases (right)]

The daily trend shows that page views and purchases fluctuate only mildly from 2017-11-25 through 2017-12-01, then both jump sharply on 2017-12-02 and stay high through 12-03. One likely driver is that December 2 and 3 fell on a weekend; a concurrent promotion is also plausible and should be checked against the actual business context. (Page views rise on Friday while purchases dip, which suggests purchases were deferred to the weekend, possibly in anticipation of weekend deals.)
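
The weekend hypothesis is easy to eyeball by tagging each date with its weekday; a quick sketch using the daily frames built above:

# Sketch: label each date with its weekday to check the weekend effect
dow = pd.to_datetime(data_buy_date.index).day_name()
print(pd.DataFrame({'pvs': data_pv_date.iloc[:, 0].to_numpy(),
                    'buys': data_buy_date.iloc[:, 0].to_numpy()}, index=dow))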

Core metrics: PV, UV, PV/UV (traffic quality), retention

# PV, UV, and browsing depth
uv = data.groupby('Date').User_Id.nunique().to_frame()  # nunique: distinct users per day
pv = data[data.Behavior_type == 'pv'].groupby('Date').User_Id.size().to_frame()  # size: row count per day
df2 = pv.merge(uv, on='Date')
df2.columns = ['PV', 'UV']
df2['PV/UV'] = np.round(df2.PV/df2.UV,2)
df2 = df2.reset_index()
df2.head()
Date PV UV PV/UV
0 2017-11-25 931879 69932 13.33
1 2017-11-26 952269 70851 13.44
2 2017-11-27 897421 70215 12.78
3 2017-11-28 883696 70304 12.57
4 2017-11-29 913154 71298 12.81
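
The same table can also be built in one pass with named aggregation (pandas >= 0.25); a sketch:

# Sketch: PV and UV per day in a single groupby
df2b = (data.assign(is_pv=data.Behavior_type.eq('pv'))
            .groupby('Date')
            .agg(PV=('is_pv', 'sum'), UV=('User_Id', 'nunique'))
            .reset_index())
df2b['PV/UV'] = (df2b.PV / df2b.UV).round(2)
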
from pyecharts.globals import CurrentConfig, OnlineHostType   # import first so charts render in the notebook
from pyecharts import options as opts
from pyecharts.charts import Line


line = (
    Line()
    .add_xaxis(df2['Date'].tolist())
    .add_yaxis('PV', df2.PV.tolist(),is_smooth=True)
    .add_yaxis('UV', df2.UV.tolist(),is_smooth=True)
#     .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title='Daily PV and UV'))
)
line2 = (
    Line()
    .add_xaxis(df2['Date'].tolist())
    .add_yaxis('PV/UV', df2['PV/UV'].tolist(), color='#675bba')
    
)
line.overlap(line2)
line.render_notebook()

New-user retention rate

# Only User_Id and Date are needed to compute retention
keep = data.loc[:,['User_Id','Date']]
# Helper that computes retention-rate tables
def retention(data, groupby='User_Id', date_x='active date', date_y='new-user retention'):
    """
    data: input frame, format:  User_Id  Date
                            0   1        2017-11-25
                            1   1        2017-11-25
                            2   1        2017-11-25
                            3   1        2017-11-25
                            4   1        2017-11-25
    groupby: join key, here User_Id
    date_x: name for the user-activity (left-hand) date column
    date_y: mode selector and name for the right-hand date column; two modes:
            date_y='new-user retention' and date_y='active-user retention'
    """
    if date_y == 'new-user retention':
        # Each user's earliest active date defines the cohort
        act_date = data.groupby(groupby).Date.min().to_frame()
        df3 = data.merge(act_date, on = groupby)
    elif date_y == 'active-user retention':
        # A full self-join would be huge, so only the first 100,000 rows are used
        df3 = data.iloc[:100000,:].merge(keep, on="User_Id")
        df3 = df3[df3.Date_y >= df3.Date_x]  # keep pairs whose right-hand date is not earlier
    df3.rename(columns={'Date_x':date_x, 'Date_y':date_y}, inplace = True)  # rename the paired date columns
    df3[date_x] = pd.to_datetime(df3[date_x])  # object -> datetime64[ns]
    df3[date_y] = pd.to_datetime(df3[date_y])
    # https://stackoverflow.com/questions/18215317/extracting-days-from-a-numpy-timedelta64-value
    df3['day_n'] = (df3[date_x] - df3[date_y]).dt.days.abs()  # whole days between the two dates
    if date_y == 'new-user retention':
        index = date_y
    elif date_y == 'active-user retention':
        index = date_x
    # index = the pivot's grouping key
    reten = pd.pivot_table(df3, index=index, columns='day_n', values=[groupby], aggfunc=lambda x:len(x.unique()),fill_value='')
    reten = reten.loc[:,groupby].reset_index().set_index(index)
    for i in reten.iloc[:,1:].columns:
        reten[i] = (reten[i].apply(lambda x:x if x != '' else 0)/reten[0]).apply(lambda x: format(x,'.2%'))
    reten.columns = ['Day 0','Day 1','Day 2','Day 3','Day 4','Day 5','Day 6','Day 7','Day 8']
    return reten
# New-user retention
retention(keep, date_y='new-user retention')
Day 0 Day 1 Day 2 Day 3 Day 4 Day 5 Day 6 Day 7 Day 8
new-user retention
2017-11-25 69932.0 78.79% 76.84% 76.02% 76.24% 76.72% 77.27% 98.52% 98.32%
2017-11-26 15755.0 64.70% 64.65% 65.90% 66.87% 68.76% 97.77% 97.23% 0.00%
2017-11-27 6283.0 61.53% 62.31% 65.27% 65.84% 97.61% 96.88% 0.00% 0.00%
2017-11-28 3089.0 60.86% 64.62% 67.59% 96.63% 96.67% 0.00% 0.00% 0.00%
2017-11-29 1807.0 69.51% 71.06% 96.35% 96.35% 0.00% 0.00% 0.00% 0.00%
2017-11-30 919.0 94.34% 96.19% 95.10% 0.00% 0.00% 0.00% 0.00% 0.00%
2017-12-01 21.0 100.00% 100.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00%
2017-12-02 4.0 100.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00%

Active-user retention rate

# Active-user (returning-user) retention
retention(keep, date_y='active-user retention')
Day 0 Day 1 Day 2 Day 3 Day 4 Day 5 Day 6 Day 7 Day 8
active date
2017-11-25 695.0 79.57% 78.56% 78.27% 78.13% 75.68% 76.12% 98.27% 98.71%
2017-11-26 723.0 76.76% 78.15% 76.49% 74.14% 74.69% 97.79% 98.48% 0.00%
2017-11-27 708.0 78.67% 78.53% 77.26% 76.84% 98.59% 99.01% 0.00% 0.00%
2017-11-28 720.0 79.86% 76.11% 78.06% 98.06% 98.61% 0.00% 0.00% 0.00%
2017-11-29 724.0 77.49% 77.21% 98.20% 98.76% 0.00% 0.00% 0.00% 0.00%
2017-11-30 713.0 78.68% 98.18% 98.46% 0.00% 0.00% 0.00% 0.00% 0.00%
2017-12-01 718.0 98.05% 98.75% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00%
2017-12-02 962.0 98.23% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00%
2017-12-03 965.0 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00%
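
As a sanity check on the pivot logic, one cohort's day-1 retention can be recomputed directly; a sketch using the keep frame from above:

# Sketch: recompute day-1 retention for the 2017-11-25 cohort by hand
first_seen = keep.groupby('User_Id').Date.min()
cohort = first_seen[first_seen == '2017-11-25'].index
day1 = keep[(keep.Date == '2017-11-26') & keep.User_Id.isin(cohort)].User_Id.nunique()
print(f'{day1 / len(cohort):.2%}')  # should land near the 78.79% above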

RFM Model

from IPython.display import Image as IMG
display(IMG(r"RFM.png"))


buy_data = data[data.Behavior_type == 'buy']
datediff = pd.to_datetime('2017-12-03') - pd.to_datetime(buy_data.Date)  # days from each purchase to 2017-12-03
r = {'Recency': datediff.values}
r = pd.DataFrame(r, index=datediff.index)
r_data = buy_data.join(r)

f = r_data.groupby('User_Id')['Behavior_type'].count()  # purchase count per user
f_data = pd.DataFrame({'Frequence': f.values, "User_Id": f.index})
RF_data = r_data.merge(f_data, how='left', on='User_Id')
RF_data.head()
User_Id Item_Id Category_Id Behavior_type Date Time Recency Frequence
0 100 1603476 2951233 buy 2017-11-25 11:18:28 8 days 8
1 100 2971043 4869428 buy 2017-11-25 21:45:49 8 days 8
2 100 598929 2429887 buy 2017-11-27 13:10:58 6 days 8
3 100 1046201 3002561 buy 2017-11-27 15:21:38 6 days 8
4 100 1606258 4098232 buy 2017-11-27 21:39:00 6 days 8
RF_data.Recency.dtype  # --> dtype('<m8[ns]')
# Convert Recency from timedelta64 to whole days, then take the median of each
# numeric column; the medians (Recency 4, Frequence 4) serve as the split points below
RF_data['Recency'] = RF_data['Recency'].dt.days
RF_data.quantile(0.5)
User_Id         303707.0
Item_Id        2592687.0
Category_Id    2640118.0
Recency              4.0
Frequence            4.0
Name: 0.5, dtype: float64
RF_data.head()
User_Id Item_Id Category_Id Behavior_type Date Time Recency Frequence
0 100 1603476 2951233 buy 2017-11-25 11:18:28 8 8
1 100 2971043 4869428 buy 2017-11-25 21:45:49 8 8
2 100 598929 2429887 buy 2017-11-27 13:10:58 6 8
3 100 1046201 3002561 buy 2017-11-27 15:21:38 6 8
4 100 1606258 4098232 buy 2017-11-27 21:39:00 6 8
RF_data['R'] = RF_data.Recency.apply(lambda x: 'high' if x <= 4 else 'low')   # recency at or below the median scores high
RF_data['F'] = RF_data.Frequence.apply(lambda x: 'high' if x > 4 else 'low')  # frequency above the median scores high
def rfm(r, f):
    if r == 'high' and f == 'high':
        return 'valuable customer'
    elif r == 'high' and f == 'low':
        return 'development customer'
    elif r == 'low' and f == 'high':
        return 'retained customer'
    else:
        return 'at-risk customer'
RF_data['RFM'] = RF_data.apply(lambda row: rfm(row['R'], row['F']), axis=1)
RF_data.head()
User_Id Item_Id Category_Id Behavior_type Date Time Recency Frequence R F RFM
0 100 1603476 2951233 buy 2017-11-25 11:18:28 8 8 low high retained customer
1 100 2971043 4869428 buy 2017-11-25 21:45:49 8 8 low high retained customer
2 100 598929 2429887 buy 2017-11-27 13:10:58 6 8 low high retained customer
3 100 1046201 3002561 buy 2017-11-27 15:21:38 6 8 low high retained customer
4 100 1606258 4098232 buy 2017-11-27 21:39:00 6 8 low high retained customer

With the RFM model, users are scored on the recency of their last purchase and their purchase frequency. Splitting each dimension at the median into high and low yields four segments: valuable, development, retained, and at-risk customers. Comparing each segment's share then supports differentiated marketing.
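
The median split used above is the simplest scoring scheme; a common variant bins R and F into quantile scores instead. A sketch (three bins per dimension, chosen arbitrarily; rank(method='first') sidesteps the duplicate bin edges that heavily tied values would cause):

# Sketch: quantile-based R/F scoring as an alternative to the median split
rf = RF_data.drop_duplicates('User_Id')[['User_Id', 'Recency', 'Frequence']].copy()
rf['R_score'] = pd.qcut(rf.Recency.rank(method='first'), 3, labels=[3, 2, 1]).astype(int)  # more recent -> higher score
rf['F_score'] = pd.qcut(rf.Frequence.rank(method='first'), 3, labels=[1, 2, 3]).astype(int)
rf['RF'] = rf.R_score.astype(str) + rf.F_score.astype(str)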

# RFM segment breakdown
from pyecharts.globals import CurrentConfig, OnlineHostType   # import first so charts render in the notebook
from pyecharts.charts import Pie
from pyecharts import options as opts

rmf_data = RF_data['RFM'].value_counts()
y_data = rmf_data.values.tolist()
x_data = rmf_data.index.tolist()
c = (
    Pie()
    .add("", [list(z) for z in zip(x_data,y_data)])
    .set_global_opts(title_opts=opts.TitleOpts(title='RFM customer value'))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} ({d}%)"))
)
c.render_notebook()

Product metrics (Goods)

Per-item clicks, favorites, add-to-carts, purchases, and purchase conversion

def item_analysis(data, groupby='Item_Id'):
    item_data = pd.pivot_table(data, index=groupby, values='User_Id', columns='Behavior_type', aggfunc='count', fill_value='')
    item_users = data.groupby(groupby).User_Id.nunique()  # users with any behavior on the item
    item_buy = data[data.Behavior_type == 'buy'].groupby(groupby).User_Id.nunique()  # users who bought the item
    buy_rate = (item_buy / item_users).to_frame().reset_index()
    item_data = item_data.merge(buy_rate, on=groupby, how='left')
    item_data['User_Id'] = item_data.iloc[:, -1].apply(lambda x: float(x) if x > 0 else 0)  # NaN ratios become 0
    item_data.rename(columns={'User_Id': 'buy_rate'}, inplace=True)
    return item_data
item_analysis(data, groupby='Item_Id')
Item_Id buy cart fav pv buy_rate
0 3 1.0 0.0
1 4 1.0 0.0
2 7 2.0 0.0
3 8 1.0 0.0
4 21 1.0 0.0
... ... ... ... ... ... ...
1560520 5163055 1.0 1.0 0.0
1560521 5163057 2.0 12.0 0.0
1560522 5163060 1.0 0.0
1560523 5163064 2.0 9.0 0.0
1560524 5163067 3.0 0.0

1560525 rows × 6 columns
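
With this table, high-converting items can be surfaced; low-traffic items are filtered out first because their conversion estimates are noisy. A sketch (the 50-view floor is arbitrary; pd.to_numeric absorbs the '' placeholders left by fill_value):

# Sketch: top items by purchase conversion among reasonably viewed items
item_stats = item_analysis(data, groupby='Item_Id')
viewed = pd.to_numeric(item_stats['pv'], errors='coerce') >= 50
item_stats[viewed].sort_values('buy_rate', ascending=False).head(10)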

Per-category clicks, favorites, add-to-carts, purchases, and purchase conversion

item_analysis(data, groupby='Category_Id')
Category_Id buy cart fav pv buy_rate
0 1147 1.0 0.000000
1 2171 12.0 13.0 3.0 142.0 0.160000
2 2410 1.0 2.0 56.0 0.022222
3 2818 2.0 1.0 0.666667
4 3579 1.0 1.0 14.0 0.142857
... ... ... ... ... ... ...
7961 5157813 3.0 0.000000
7962 5158474 5.0 13.0 2.0 166.0 0.048780
7963 5160054 3.0 0.000000
7964 5161669 11.0 15.0 3.0 118.0 0.214286
7965 5162429 2.0 0.000000

7966 rows × 6 columns

Platform metrics (Place)

Funnel model

Conversion path analysis

behavior_data = data["Behavior_type"].value_counts()
# pv      8944510
# cart     559132
# fav      291657
# buy      199142
dict_num = {'behavior':behavior_data.index,'numbers':behavior_data.values}
behavior_data = pd.DataFrame(dict_num)
# Merge cart and fav into a single mid-funnel step
behavior_data.loc[1,"behavior"]="cart+fav"
behavior_data.loc[1,"numbers"]+=behavior_data.loc[2,"numbers"]

behavior_data=behavior_data.loc[[0,1,3],:]
behavior_data
behavior numbers
0 pv 8944510
1 cart+fav 850789
3 buy 199142
temp1 = np.array(behavior_data['numbers'][1:])  
temp2 = np.array(behavior_data['numbers'][0:-1])
# np.array([1,2])/np.array([2,4]) --> array([0.5, 0.5])
single_convs = temp1 / temp2
single_convs = list(single_convs)
single_convs.insert(0, 1)  # the funnel's top level converts at 1.0 by definition
single_convs = [round(x, 4) for x in single_convs]  # round conversion rates to four decimals
behavior_data['step conversion'] = single_convs
temp3 = np.array(behavior_data['numbers'])  # array([8944510,  850789,  199142], dtype=int64)
temp4 = np.ones(len(behavior_data['numbers'])) * behavior_data['numbers'][0]  # array([8944510., 8944510., 8944510.])
total_conv = [round(x, 4) for x in (temp3 / temp4).tolist()]
behavior_data['overall conversion'] = total_conv
behavior_data
behavior numbers step conversion overall conversion
0 pv 8944510 1.0000 1.0000
1 cart+fav 850789 0.0951 0.0951
3 buy 199142 0.2341 0.0223
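
As a quick consistency check, the cumulative product of the step conversions should reproduce the overall conversion column:

# Sketch: 1.0 * 0.0951 * 0.2341 ≈ 0.0223, matching 'overall conversion'
print(behavior_data['step conversion'].cumprod().round(4).tolist())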
def behavior_convert_font(x):
    if x == 'pv':
        return 'Impressions'
    elif x == 'cart+fav':
        return 'Favorites + Add to cart'
    elif x == 'buy':
        return 'Purchases'
behavior_data['behavior'] = behavior_data.behavior.apply(lambda x:behavior_convert_font(x))
# Plot
from pyecharts.charts import Funnel

attrs = behavior_data['behavior'].tolist()
attr_value = (np.array(behavior_data['overall conversion']) * 100).tolist()

funnel = (
    Funnel()
    .add("", [list(z) for z in zip(attrs, attr_value)])
    .set_global_opts(title_opts=opts.TitleOpts(title='Overall conversion funnel'))
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}: {d}%'))
)

funnel.render_notebook()

References

Data source: https://tianchi.aliyun.com/dataset/dataDetail?dataId=649
pyecharts chart gallery: https://gallery.pyecharts.org/#/README