Dataset Overview
This dataset contains all behaviors (clicks, purchases, add-to-cart, and favorites) of roughly one million randomly sampled users who were active between November 25 and December 3, 2017.
The dataset is organized like MovieLens-20M: each row represents one user action and consists of User ID, Item ID, Category ID, Behavior type, and Timestamp, separated by commas.
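In the Behavior_type field these actions are encoded as pv (page view of an item detail page, i.e. a click), buy (purchase), cart (add to cart), and fav (add to favorites), which matches the value counts seen later.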
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # use a Chinese-capable font so chart labels render
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign displayable with this font
Data Preprocessing
# Read the data
data = pd.read_csv(r'UserBehavior.csv')
data.shape
(100150806, 5)
# The full dataset is large and memory-hungry to load locally, so only the first 10,000,000 rows are kept
data.head(10000000).to_csv('10000000_data.csv', index=False, header=['User_Id','Item_Id','Category_Id','Behavior_type','Timestamp'])
# Free memory
del data
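A lighter alternative is to read only the rows needed in the first place. A minimal sketch, assuming the raw UserBehavior.csv has no header row (in which case the default read above silently consumes the first record as a header):
cols = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior_type', 'Timestamp']
# nrows stops parsing after 10,000,000 records instead of materializing all ~100 million rows
sample = pd.read_csv('UserBehavior.csv', names=cols, nrows=10_000_000)
sample.to_csv('10000000_data.csv', index=False)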
# Read the sampled data
df = pd.read_csv(r'10000000_data.csv')
df.head()
   User_Id  Item_Id  Category_Id Behavior_type   Timestamp
0        1  2333346      2520771            pv  1511561733
1        1  2576651       149192            pv  1511572885
2        1  3830808      4181361            pv  1511593493
3        1  4365585      2520377            pv  1511596146
4        1  4606018      2735466            pv  1511616481
Data Cleaning
# Check for missing values
df.isnull().sum()
User_Id 0
Item_Id 0
Category_Id 0
Behavior_type 0
Timestamp 0
dtype: int64
# Check for anomalous values
print(df[df.Timestamp<0].count())
print('-' * 50)
print(df[df.Category_Id<0].count())
print('-' * 50)
print(df[df.Item_Id<0].count())
print('-' * 50)
print(df[df.User_Id<0].count())
print('-' * 50)
print(df.Behavior_type.value_counts())
User_Id 29
Item_Id 29
Category_Id 29
Behavior_type 29
Timestamp 29
dtype: int64
--------------------------------------------------
User_Id 0
Item_Id 0
Category_Id 0
Behavior_type 0
Timestamp 0
dtype: int64
--------------------------------------------------
User_Id 0
Item_Id 0
Category_Id 0
Behavior_type 0
Timestamp 0
dtype: int64
--------------------------------------------------
User_Id 0
Item_Id 0
Category_Id 0
Behavior_type 0
Timestamp 0
dtype: int64
--------------------------------------------------
pv 8950064
cart 559132
fav 291657
buy 199147
Name: Behavior_type, dtype: int64
# The Timestamp field contains 29 anomalous (negative) rows; since the dataset is large and the anomalies are few, drop them directly
df = df[df.Timestamp>0]
# Process the Timestamp field: add a column converting the Unix timestamp to a datetime string
import time
df['dates']= df.loc[:,'Timestamp'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(x)))
df.loc[:,'Date']=df['dates'].apply(lambda x:x.split(' ')[0])
df.loc[:,'Time']=df['dates'].apply(lambda x:x.split(' ')[1])
df = df.drop(labels=['Timestamp','dates'],axis=1) # drop the raw Timestamp column and the intermediate dates column
df.head()
   User_Id  Item_Id  Category_Id Behavior_type        Date      Time
0        1  2333346      2520771            pv  2017-11-25  06:15:33
1        1  2576651       149192            pv  2017-11-25  09:21:25
2        1  3830808      4181361            pv  2017-11-25  15:04:53
3        1  4365585      2520377            pv  2017-11-25  15:49:06
4        1  4606018      2735466            pv  2017-11-25  21:28:01
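Note that time.localtime interprets the Unix timestamps in the machine's local timezone, so the Date/Time columns above are only correct on a machine set to UTC+8 (Beijing time). A vectorized, timezone-explicit sketch of the same conversion (run before the Timestamp column is dropped):
# Convert Unix seconds to Beijing time explicitly, independent of the machine's timezone
ts = pd.to_datetime(df['Timestamp'], unit='s', utc=True).dt.tz_convert('Asia/Shanghai')
df['Date'] = ts.dt.strftime('%Y-%m-%d')
df['Time'] = ts.dt.strftime('%H:%M:%S')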
# Keep only records between 2017-11-25 and 2017-12-03 for the analysis
cleaned_data=df[(df["Date"]>='2017-11-25')&(df["Date"]<='2017-12-03')]
print(cleaned_data["Date"].value_counts())
2017-12-02 1385240
2017-12-03 1369603
2017-12-01 1086983
2017-11-26 1063230
2017-11-30 1038274
2017-11-25 1038049
2017-11-29 1020800
2017-11-27 1004379
2017-11-28 987888
Name: Date, dtype: int64
# Remove duplicate rows
print(cleaned_data.duplicated().value_counts())
cleaned_data = cleaned_data.drop_duplicates()
print('-'* 50)
print(cleaned_data.duplicated().value_counts())
False 9994441
True 5
dtype: int64
--------------------------------------------------
False 9994441
dtype: int64
# Save the preprocessed data
# cleaned_data.to_csv(r'cleaned_data.csv', encoding = 'utf-8', index = False)
# Free memory
# del df
Data Analysis
# Read the cleaned data
data = pd.read_csv(r'cleaned_data.csv')
data.head()
   User_Id  Item_Id  Category_Id Behavior_type        Date      Time
0        1  2333346      2520771            pv  2017-11-25  06:15:33
1        1  2576651       149192            pv  2017-11-25  09:21:25
2        1  3830808      4181361            pv  2017-11-25  15:04:53
3        1  4365585      2520377            pv  2017-11-25  15:49:06
4        1  4606018      2735466            pv  2017-11-25  21:28:01
data.info()
RangeIndex: 9994441 entries, 0 to 9994440
Data columns (total 6 columns):
# Column Dtype
--- ------ -----
0 User_Id int64
1 Item_Id int64
2 Category_Id int64
3 Behavior_type object
4 Date object
5 Time object
dtypes: int64(3), object(3)
memory usage: 457.5+ MB
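Behavior_type, Date, and Time are stored as Python objects, which is where most of the ~457 MB goes. An optional sketch for shrinking this by casting the low-cardinality columns to the pandas category dtype (the analysis below does not depend on it):
# Store low-cardinality string columns as categoricals to cut memory
data['Behavior_type'] = data['Behavior_type'].astype('category')
data['Date'] = data['Date'].astype('category')
data.info(memory_usage='deep') # 'deep' reports the true per-column footprint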
## Basic Dataset Statistics
describe1 = pd.DataFrame({"用户数量": data["User_Id"].nunique(),
                          "产品数量": data["Item_Id"].nunique(),
                          "产品种类数量": data["Category_Id"].nunique(),
                          "统计天数": data["Date"].nunique()}, index=['1'])
describe1
   用户数量     产品数量  产品种类数量  统计天数
1  97810  1560525    7966     9
Build the Metric System with the "User-Goods-Platform" (人货场) Decomposition
User Metrics (人 / Users)
User behavior statistics
describe = data.loc[:,["User_Id","Behavior_type"]]
# Placeholder frame indexed by user, used as the base for the joins below
ids = pd.DataFrame(np.zeros(len(set(list(data["User_Id"])))),index=set(list(data["User_Id"])))
pv_class=describe[describe["Behavior_type"]=="pv"].groupby("User_Id").count()
pv_class.columns = ["pv"]
buy_class=describe[describe["Behavior_type"]=="buy"].groupby("User_Id").count()
buy_class.columns = ["buy"]
fav_class=describe[describe["Behavior_type"]=="fav"].groupby("User_Id").count()
fav_class.columns = ["fav"]
cart_class=describe[describe["Behavior_type"]=="cart"].groupby("User_Id").count()
cart_class.columns = ["cart"]
user_behavior_counts=ids.join(pv_class).join(fav_class).join(cart_class).join(buy_class).iloc[:,1:]
user_behavior_counts.head()
            pv   fav  cart  buy
1         54.0   NaN   NaN  NaN
5         64.0   NaN   NaN  NaN
262150    19.0   NaN   NaN  NaN
524294    62.0  11.0   8.0  2.0
524295   110.0   4.0   2.0  3.0
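The join chain above can also be expressed as a single pivot_table call that yields the same per-user behavior counts; a minimal sketch on the same frame:
# One row per user, one column per behavior type, counting rows in each cell
user_behavior_counts2 = data.pivot_table(index='User_Id', columns='Behavior_type',
                                         values='Item_Id', aggfunc='count')
user_behavior_counts2.head()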
Daily Trend of Total Page Views and Purchases
## data["Behavior_type"].value_counts()
# pv 8944510
# cart 559132
# fav 291657
# buy 199142
# Page-view data
data_pv = data[data["Behavior_type"]=="pv"][["Behavior_type","Date"]]
data_pv_date=data_pv.groupby(by="Date").count()
# print("访问量数据(单/天)","\n",data_pv_date)
# Purchase data
data_buy = data[data["Behavior_type"]=="buy"][["Behavior_type","Date"]]
data_buy_date=data_buy.groupby(by="Date").count()
# print("成交量数据(单/天)","\n",data_buy_date)
# Plot
plt.figure(figsize=(20,6),dpi =70)
plt.title("总访问量、成交量变化折线图")
ax1 = plt.subplot(1,2,1)
ax2 = plt.subplot(1,2,2)
plt.sca(ax1)
plt.plot(data_pv_date.index,data_pv_date.values,label="pv",color="green",linewidth=2)
plt.title("总访问量变化折线图")
plt.xlabel("Date")
plt.ylabel("Times")
plt.sca(ax2)
plt.title("总成交量变化折线图")
plt.plot(data_buy_date.index,data_buy_date.values,label="buy",linewidth=2)
plt.xlabel("Date")
plt.ylabel("Times")
plt.show()
[Figure: line charts of daily total page views (left) and purchases (right)]
The daily trend shows that page views and purchases fluctuated only mildly from 2017-11-25 through 2017-12-01, then both rose sharply on 2017-12-02 and stayed high through 2017-12-03. One likely driver is that December 2-3 fell on a weekend; a promotion on those days is also possible and should be checked against the actual business calendar. (Page views rose on Friday while purchases dipped, which may reflect purchases being deferred to the weekend.)
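To back up the weekend explanation, each date in the window can be mapped to its day of week; a quick sketch:
# 2017-12-01 is a Friday; 2017-12-02 and 2017-12-03 are Saturday and Sunday
dates = pd.to_datetime(sorted(data['Date'].unique()))
print(pd.Series(dates.day_name(), index=dates.strftime('%Y-%m-%d')))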
Basic Metrics: PV, UV, PV/UV (Traffic Quality), and Retention
# PV, UV, and browsing depth (PV per UV)
uv = data.groupby('Date').User_Id.nunique().to_frame() # nunique: distinct users per day
pv = data[data.Behavior_type == 'pv'].groupby('Date').User_Id.size().to_frame() # size: number of pv rows per day
df2 = pv.merge(uv, on='Date')
df2.columns = ['PV', 'UV']
df2['PV/UV'] = np.round(df2.PV/df2.UV,2)
df2 = df2.reset_index()
df2.head()
         Date      PV     UV  PV/UV
0  2017-11-25  931879  69932  13.33
1  2017-11-26  952269  70851  13.44
2  2017-11-27  897421  70215  12.78
3  2017-11-28  883696  70304  12.57
4  2017-11-29  913154  71298  12.81
from pyecharts.globals import CurrentConfig, OnlineHostType # import beforehand so the charts render in the notebook
from pyecharts import options as opts
from pyecharts.charts import Line
line = (
Line()
.add_xaxis(df2['Date'].tolist())
.add_yaxis('PV', df2.PV.tolist(),is_smooth=True)
.add_yaxis('UV', df2.UV.tolist(),is_smooth=True)
# .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(title_opts=opts.TitleOpts(title='每日UV和PV情况'))
)
line2 = (
Line()
.add_xaxis(df2['Date'].tolist())
.add_yaxis('PV/UV', df2['PV/UV'].tolist(), color='#675bba')
)
line.overlap(line2)
line.render_notebook()
New-User Retention Rate
# For retention counts, keep only the User_Id and Date columns
keep = data.loc[:,['User_Id','Date']]
# Define a function that computes retention rates
def retention(data, date_y, groupby='User_Id', date_x='活跃日期'):
    """
    data:    DataFrame with User_Id and Date columns, one row per user action, e.g.
                User_Id        Date
             0        1  2017-11-25
             1        1  2017-11-25
    groupby: join key, here User_Id
    date_x:  name given to the user-activity date column (left table)
    date_y:  retention mode and name given to the anchor date column (right table);
             '新用户留存' anchors on each user's first active date (new-user retention),
             '老用户留存' anchors on every active date (active-user retention)
    """
    if date_y == '新用户留存':
        # Anchor date = each user's earliest active date --> new-user retention
        act_date = data.groupby(groupby).Date.min().to_frame()
        df3 = data.merge(act_date, on=groupby)
    elif date_y == '老用户留存':
        # The self-join explodes the row count, so only the first 100,000 records are used
        df3 = data.iloc[:100000, :].merge(keep, on="User_Id")
        df3 = df3[df3.Date_y >= df3.Date_x]  # keep pairs where the activity date is on or after the anchor date
    df3.rename(columns={'Date_x': date_x, 'Date_y': date_y}, inplace=True)  # rename the merge suffixes
    df3[date_x] = pd.to_datetime(df3[date_x])  # object -> datetime64[ns]
    df3[date_y] = pd.to_datetime(df3[date_y])
    # https://stackoverflow.com/questions/18215317/extracting-days-from-a-numpy-timedelta64-value
    df3['第n天活跃'] = (df3[date_x] - df3[date_y]).dt.days.abs()  # timedelta64[ns] -> whole days
    if date_y == '新用户留存':
        index = date_y
    elif date_y == '老用户留存':
        index = date_x
    # index = the grouping key of the pivot
    reten = pd.pivot_table(df3, index=index, columns='第n天活跃', values=[groupby],
                           aggfunc=lambda x: len(x.unique()), fill_value='')
    reten = reten.loc[:, groupby].reset_index().set_index(index)
    for i in reten.iloc[:, 1:].columns:
        reten[i] = (reten[i].apply(lambda x: x if x != '' else 0) / reten[0]).apply(lambda x: format(x, '.2%'))
    reten.columns = ['首日', '1日留存率', '2日留存率', '3日留存率', '4日留存率',
                     '5日留存率', '6日留存率', '7日留存率', '8日留存率']
    return reten
# Compute new-user retention
retention(keep, date_y='新用户留存')
新用户留存         首日   1日留存率   2日留存率   3日留存率   4日留存率   5日留存率   6日留存率   7日留存率   8日留存率
2017-11-25  69932.0   78.79%   76.84%   76.02%   76.24%   76.72%   77.27%   98.52%   98.32%
2017-11-26  15755.0   64.70%   64.65%   65.90%   66.87%   68.76%   97.77%   97.23%    0.00%
2017-11-27   6283.0   61.53%   62.31%   65.27%   65.84%   97.61%   96.88%    0.00%    0.00%
2017-11-28   3089.0   60.86%   64.62%   67.59%   96.63%   96.67%    0.00%    0.00%    0.00%
2017-11-29   1807.0   69.51%   71.06%   96.35%   96.35%    0.00%    0.00%    0.00%    0.00%
2017-11-30    919.0   94.34%   96.19%   95.10%    0.00%    0.00%    0.00%    0.00%    0.00%
2017-12-01     21.0  100.00%  100.00%    0.00%    0.00%    0.00%    0.00%    0.00%    0.00%
2017-12-02      4.0  100.00%    0.00%    0.00%    0.00%    0.00%    0.00%    0.00%    0.00%
Active-User Retention Rate
# Compute active-user (repeat) retention
retention(keep, date_y='老用户留存')
活跃日期          首日  1日留存率  2日留存率  3日留存率  4日留存率  5日留存率  6日留存率  7日留存率  8日留存率
2017-11-25  695.0  79.57%  78.56%  78.27%  78.13%  75.68%  76.12%  98.27%  98.71%
2017-11-26  723.0  76.76%  78.15%  76.49%  74.14%  74.69%  97.79%  98.48%   0.00%
2017-11-27  708.0  78.67%  78.53%  77.26%  76.84%  98.59%  99.01%   0.00%   0.00%
2017-11-28  720.0  79.86%  76.11%  78.06%  98.06%  98.61%   0.00%   0.00%   0.00%
2017-11-29  724.0  77.49%  77.21%  98.20%  98.76%   0.00%   0.00%   0.00%   0.00%
2017-11-30  713.0  78.68%  98.18%  98.46%   0.00%   0.00%   0.00%   0.00%   0.00%
2017-12-01  718.0  98.05%  98.75%   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%
2017-12-02  962.0  98.23%   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%
2017-12-03  965.0   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%   0.00%
RFM Model
from IPython.display import Image as IMG
display(IMG(r"RFM.png"))
[Figure: RFM model diagram (RFM.png)]
buy_data = data[data.Behavior_type == 'buy']
datediff = pd.to_datetime('2017-12-03') - pd.to_datetime(buy_data.Date) # days between each purchase and the end of the window (recency)
r = {'Recency':datediff.values}
r = pd.DataFrame(r, index=datediff.index)
r_data = buy_data.join(r)
f = r_data.groupby('User_Id')['Behavior_type'].count() # purchase count per user (frequency)
f_data = pd.DataFrame({'Frequence': f.values, "User_Id": f.index})
RF_data = r_data.merge(f_data, how='left',on='User_Id' )
RF_data.head()
   User_Id  Item_Id  Category_Id Behavior_type        Date      Time Recency  Frequence
0      100  1603476      2951233           buy  2017-11-25  11:18:28  8 days          8
1      100  2971043      4869428           buy  2017-11-25  21:45:49  8 days          8
2      100   598929      2429887           buy  2017-11-27  13:10:58  6 days          8
3      100  1046201      3002561           buy  2017-11-27  15:21:38  6 days          8
4      100  1606258      4098232           buy  2017-11-27  21:39:00  6 days          8
RF_data.Recency.dtype # --> dtype('<m8[ns]')
# Recency is a timedelta; convert it to whole days so it can be scored numerically
RF_data['Recency'] = RF_data.Recency.dt.days
# Use the median as the high/low cut-off for R and F (both come out as 4)
RF_data.quantile(0.5, numeric_only=True)
User_Id         303707.0
Item_Id        2592687.0
Category_Id    2640118.0
Recency              4.0
Frequence            4.0
Name: 0.5, dtype: float64
RF_data.head()
   User_Id  Item_Id  Category_Id Behavior_type        Date      Time  Recency  Frequence
0      100  1603476      2951233           buy  2017-11-25  11:18:28        8          8
1      100  2971043      4869428           buy  2017-11-25  21:45:49        8          8
2      100   598929      2429887           buy  2017-11-27  13:10:58        6          8
3      100  1046201      3002561           buy  2017-11-27  15:21:38        6          8
4      100  1606258      4098232           buy  2017-11-27  21:39:00        6          8
RF_data['R'] = RF_data.Recency.apply(lambda x: '高' if x <= 4 else '低')   # score R on recency in days
RF_data['F'] = RF_data.Frequence.apply(lambda x: '高' if x > 4 else '低')  # score F on purchase frequency
def rmf(r, f):
    if r == '高' and f == '高':
        return '价值客户'
    elif r == '高' and f == '低':
        return '发展客户'
    elif r == '低' and f == '高':
        return '保持客户'
    else:
        return '挽留客户'
RF_data['RFM'] = RF_data.apply(lambda row:rmf(row['R'], row['F']), axis=1)
RF_data.head()
   User_Id  Item_Id  Category_Id Behavior_type        Date      Time  Recency  Frequence   R   F   RFM
0      100  1603476      2951233           buy  2017-11-25  11:18:28        8          8  低  高  保持客户
1      100  2971043      4869428           buy  2017-11-25  21:45:49        8          8  低  高  保持客户
2      100   598929      2429887           buy  2017-11-27  13:10:58        6          8  低  高  保持客户
3      100  1046201      3002561           buy  2017-11-27  15:21:38        6          8  低  高  保持客户
4      100  1606258      4098232           buy  2017-11-27  21:39:00        6          8  低  高  保持客户
With the RFM model, users are scored on recency of last purchase and purchase frequency. Splitting each dimension into high and low divides users into four groups: 价值客户 (high-value), 发展客户 (develop), 保持客户 (retain), and 挽留客户 (win back). Analyzing the share of each group enables differentiated marketing.
# RFM user segment analysis
from pyecharts.globals import CurrentConfig, OnlineHostType # import beforehand so the charts render in the notebook
from pyecharts.charts import Pie
from pyecharts import options as opts
rmf_data = RF_data['RFM'].value_counts()
y_data = rmf_data.values.tolist()
x_data = rmf_data.index.tolist()
c = (
Pie()
.add("", [list(z) for z in zip(x_data,y_data)])
.set_global_opts(title_opts=opts.TitleOpts(title='RFM用户价值'))
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} ({d}%)"))
)
c.render_notebook()
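RF_data has one row per purchase record, so the pie above weights each user by how many purchases they made. A user-level breakdown can be sketched as follows, reusing the median cut-off of 4 and the rmf helper defined above:
# Collapse to one row per user: most recent purchase (smallest Recency) plus total purchases
user_level = RF_data.groupby('User_Id').agg(Recency=('Recency', 'min'),
                                            Frequence=('Frequence', 'max'))
user_level['R'] = user_level.Recency.apply(lambda x: '高' if x <= 4 else '低')
user_level['F'] = user_level.Frequence.apply(lambda x: '高' if x > 4 else '低')
user_level['RFM'] = user_level.apply(lambda row: rmf(row['R'], row['F']), axis=1)
print(user_level['RFM'].value_counts(normalize=True).round(4)) # share of users per segment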
Item Analysis (货 / Goods)
Per-item clicks, favorites, add-to-carts, purchases, and purchase conversion
def item_analysis(data, groupby='Item_Id'):
    # One row per item/category, one column per behavior type, counting actions
    item_data = pd.pivot_table(data, index=groupby, values='User_Id', columns='Behavior_type', aggfunc='count', fill_value='')
    item_users = data.groupby(groupby).User_Id.nunique() # users who performed any behavior on the item
    item_buy = data[data.Behavior_type == 'buy'].groupby(groupby).User_Id.nunique() # users who bought the item
    buy_rate = (item_buy/item_users).to_frame().reset_index()
    item_data = item_data.merge(buy_rate, on=groupby, how='left')
    item_data['User_Id'] = item_data.iloc[:,-1].apply(lambda x: float(x) if x > 0 else 0) # NaN (never bought) -> 0
    item_data.rename(columns={'User_Id': '购买转化'}, inplace=True)
    return item_data
item_analysis(data, groupby='Item_Id')
         Item_Id  buy  cart  fav    pv  购买转化
0              3                   1.0   0.0
1              4                   1.0   0.0
2              7                   2.0   0.0
3              8             1.0         0.0
4             21                   1.0   0.0
...          ...  ...   ...  ...   ...   ...
1560520  5163055             1.0   1.0   0.0
1560521  5163057       2.0       12.0   0.0
1560522  5163060                   1.0   0.0
1560523  5163064             2.0   9.0   0.0
1560524  5163067                   3.0   0.0

1560525 rows × 6 columns
Per-category clicks, favorites, add-to-carts, purchases, and purchase conversion
item_analysis(data, groupby='Category_Id')
      Category_Id   buy  cart  fav     pv      购买转化
0            1147                     1.0  0.000000
1            2171  12.0  13.0  3.0  142.0  0.160000
2            2410   1.0   2.0        56.0  0.022222
3            2818   2.0               1.0  0.666667
4            3579   1.0   1.0        14.0  0.142857
...           ...   ...   ...  ...    ...       ...
7961      5157813                     3.0  0.000000
7962      5158474   5.0  13.0  2.0  166.0  0.048780
7963      5160054                     3.0  0.000000
7964      5161669  11.0  15.0  3.0  118.0  0.214286
7965      5162429                     2.0  0.000000

7966 rows × 6 columns
Platform Metrics (场 / Platform)
Funnel Model
Conversion path analysis
behavior_data = data["Behavior_type"].value_counts()
# pv 8944510
# cart 559132
# fav 291657
# buy 199142
dict_num = {'behavior':behavior_data.index,'numbers':behavior_data.values}
behavior_data = pd.DataFrame(dict_num)
# Merge cart and fav into a single "cart+fav" funnel stage, then keep pv, cart+fav, buy
behavior_data.loc[1,"behavior"]="cart+fav"
behavior_data.loc[1,"numbers"]+=behavior_data.loc[2,"numbers"]
behavior_data=behavior_data.loc[[0,1,3],:]
behavior_data
   behavior  numbers
0        pv  8944510
1  cart+fav   850789
3       buy   199142
temp1 = np.array(behavior_data['numbers'][1:])
temp2 = np.array(behavior_data['numbers'][0:-1])
# np.array([1,2])/np.array([2,4]) --> array([0.5, 0.5])
single_convs = temp1 / temp2
single_convs = list(single_convs)
single_convs.insert(0,1) # the first stage's step conversion is defined as 1.0
single_convs = [round(x,4) for x in single_convs] # keep four decimal places
behavior_data['单一环节转化率'] = single_convs
temp3 = np.array(behavior_data['numbers']) # array([8944510, 850789, 199142], dtype=int64)
temp4 = np.ones(len(behavior_data['numbers'])) * behavior_data['numbers'][0] # array([8944510., 8944510., 8944510.])
total_conv = [round(x,4) for x in (temp3/temp4).tolist()]
behavior_data['总体转化率'] = total_conv
behavior_data
   behavior  numbers  单一环节转化率   总体转化率
0        pv  8944510   1.0000  1.0000
1  cart+fav   850789   0.0951  0.0951
3       buy   199142   0.2341  0.0223
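As a sanity check on the step conversions: 850789 / 8944510 ≈ 0.0951 (pv → cart+fav), 199142 / 850789 ≈ 0.2341 (cart+fav → buy), and 199142 / 8944510 ≈ 0.0223 overall.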
def behavior_convert_font(x):
    if x == 'pv':
        return '曝光'
    elif x == 'cart+fav':
        return '收藏+加入购物车'
    elif x == 'buy':
        return '下单'
behavior_data['behavior'] = behavior_data.behavior.apply(lambda x:behavior_convert_font(x))
# Plot the funnel
from pyecharts.charts import Funnel
attrs = behavior_data['behavior'].tolist()
attr_value = (np.array(behavior_data['总体转化率'])* 100).tolist()
funnel = (
Funnel()
.add("", [list(z) for z in zip(attrs, attr_value)])
.set_global_opts(title_opts=opts.TitleOpts(title='总体转化漏斗图'))
.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}: {d}%'))
)
funnel.render_notebook()
References
Data source: https://tianchi.aliyun.com/dataset/dataDetail?dataId=649
pyecharts chart gallery: https://gallery.pyecharts.org/#/README