数据预处理——从mysql中读取数据


数据预处理——从mysql中读取数据

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import random

plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
plt.rcParams['axes.unicode_minus'] = False

%config InlineBackend.figure_format = 'svg'
import pymysql

conn = pymysql.connect(
    host='localhost', port=3306,
    user='root', password='',
    database='ai_data', charset='utf8mb4'
)
conn
sql = 'select * from ai_datayuanyin;'# 无效本来就没有
data = pd.read_sql(sql,conn)# Read SQL query or database table into a DataFrame.
data.head()
data.info()
data["Publication Year"].value_counts()
data["Hot Paper Status"].value_counts()
HotPaperStatus = data[data["Hot Paper Status"]=="N"]
HotPaperStatus.to_csv('Hot Paper Status.csv')
data["Highly Cited Status"].value_counts()
HighlyCitedStatus = data[data["Highly Cited Status"]=="Y"]
HighlyCitedStatus.to_csv('Highly Cited Status.csv')
data['Author Full Names'] = data['Author Full Names'].fillna("nan")
data['Author Keywords'] = data['Author Keywords'].fillna("nan")
data['Abstract'] = data['Abstract'].fillna("nan")

# Number of authors
data.loc[:,'Number_of_authors'] = data['Author Full Names'].apply(lambda x: len(x.split(";"))) 
data.loc[:,'Number_of_Keywords'] = data['Author Keywords'].apply(lambda x: len(x.split(";"))) 
data.loc[:,'Length_of_Abstract'] = data['Abstract'].apply(lambda x: len(x.split(" "))) 
# 列号看一下,可能需要改动
final_data = data.iloc[:, [0,1,5,10,11,14,17,19,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,58,59,60]]
final_data.head()
final_data.to_csv("final_data.csv")
# 原文指标完毕
# 被引频次,excel按照年份进行重新排序!!!
beiyinpinci = final_data.iloc[:,[5,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39]]
beiyinpinci.to_csv("beiyinpinci.csv")
# 保存了则读取,excel按照年份进行重新排序!!!
data = pd.read_csv("beiyinpinci.csv", low_memory=False)
final_data = pd.read_csv("final_data.csv", low_memory=False)
data.head()
# 提取需要计算离散系数的数据
data.iloc[:,2:].head()
# 离散系数计算
data[data["Publication Year"]==1990].iloc[:,2:].std(axis = 1, skipna = True)/data.iloc[:,2:].mean(axis = 1, skipna = True)
data.loc[data["Publication Year"]==1990,'Fluctuation_of_annual_citation'] = data[data["Publication Year"]==1990].iloc[:,2:].std(axis = 1, skipna = True)/data.iloc[:,2:].mean(axis = 1, skipna = True)
# 离散系数列Fluctuation_of_annual_citation
for i in range(0,23):
    data.loc[data["Publication Year"]==1990+i,'Fluctuation_of_annual_citation'] = data[data["Publication Year"]==1990+i].iloc[:,i+2:].std(axis = 1, skipna = True)/data.iloc[:,i+2:].mean(axis = 1, skipna = True)
# 存储到final_data中去
final_data.loc[:,'Fluctuation_of_annual_citation']=data["Fluctuation_of_annual_citation"].fillna("nan")
#
final_data.to_csv("final_data.csv")
data.to_csv("beiyinpinci.csv")