C# 采集知网


采集知网

WebClient

 /// <summary>
    /// A <see cref="WebClient"/> that keeps Session and Cookie state across requests
    /// by attaching one shared <see cref="CookieContainer"/> to every HTTP request.
    /// </summary>
    public class WebClientHelper:WebClient {

        // Binding flags used to read the private fields of CookieContainer and its path lists.
        private const System.Reflection.BindingFlags PrivateInstanceField =
            System.Reflection.BindingFlags.NonPublic|System.Reflection.BindingFlags.GetField|
            System.Reflection.BindingFlags.Instance;

        // Cookie container attached to every HttpWebRequest created by this client.
        private CookieContainer cookieContainer;

        /// <summary>
        /// Creates a new client with an empty cookie container.
        /// </summary>
        public WebClientHelper() {
            this.cookieContainer = new CookieContainer();
        }

        /// <summary>
        /// Creates a new client that uses the supplied cookie container.
        /// </summary>
        /// <param name="cookies">Cookie container to attach to every request.</param>
        public WebClientHelper(CookieContainer cookies) {
            this.cookieContainer = cookies;
        }

        /// <summary>
        /// The cookie container attached to every request.
        /// </summary>
        public CookieContainer Cookies {
            get { return this.cookieContainer; }
            set { this.cookieContainer = value; }
        }

        /// <summary>
        /// Adds a custom cookie (for example login information) to the container.
        /// <paramref name="name"/> and <paramref name="value"/> are trimmed before use.
        /// </summary>
        /// <param name="name">Cookie name.</param>
        /// <param name="value">Cookie value.</param>
        /// <param name="path">Cookie path.</param>
        /// <param name="domain">Cookie domain.</param>
        public void SetCustCookie(string name,string value,string path,string domain) {
            Cookies.Add(new Cookie(name.Trim(),value.Trim(),path,domain));
        }

        /// <summary>
        /// Returns the request created by the base class; when it is an
        /// <see cref="HttpWebRequest"/> the cookie container is attached to it.
        /// </summary>
        /// <param name="address">Address of the resource to request.</param>
        /// <returns>The request object, with cookies attached for HTTP requests.</returns>
        protected override WebRequest GetWebRequest(Uri address) {
            WebRequest request = base.GetWebRequest(address);
            HttpWebRequest httpRequest = request as HttpWebRequest;
            if(httpRequest != null) {
                httpRequest.CookieContainer = cookieContainer;
            }
            return request;
        }

        #region 封装了PostData, GetSrc 和 GetFile 方法

        /// <summary>
        /// POSTs form data to the given URL and returns the response page.
        /// </summary>
        /// <param name="uriString">URL to POST to.</param>
        /// <param name="postString">Data to POST.</param>
        /// <param name="postStringEncoding">
        /// Charset used to encode the POST data (default "utf-8").
        /// See https://blog.csdn.net/gengyiping18/article/details/77620061
        /// </param>
        /// <param name="dataEncoding">Charset used to decode the response (default "utf-8").</param>
        /// <returns>
        /// The page source with all tab, CR and LF characters removed,
        /// or the literal string "error" when a <see cref="WebException"/> occurs.
        /// </returns>
        public string PostData(string uriString,string postString,string postStringEncoding = "utf-8",string dataEncoding = "utf-8") {
            try {
                // Convert the POST string to bytes using the requested charset.
                byte[] postData = Encoding.GetEncoding(postStringEncoding).GetBytes(postString);
                // Assign through the indexer instead of Add so repeated calls can never
                // accumulate a second Content-Type value on the same client.
                this.Headers[HttpRequestHeader.ContentType] = "application/x-www-form-urlencoded";
                // Upload the data and decode the returned page.
                byte[] responseData = this.UploadData(uriString,"POST",postData);
                return DecodeAndFlatten(responseData,dataEncoding);
            } catch(WebException we) {
                Logger.LogError("PostData:" + uriString + "?" + postString,we);
                return "error";
            }
        }

        /// <summary>
        /// Downloads the source of the given URL.
        /// </summary>
        /// <param name="uriString">Page URL.</param>
        /// <param name="dataEncoding">Charset used to decode the response (default "utf-8").</param>
        /// <returns>
        /// The page source with all tab, CR and LF characters removed,
        /// or the literal string "error" when a <see cref="WebException"/> occurs.
        /// </returns>
        public string GetSrc(string uriString,string dataEncoding = "utf-8") {
            try {
                byte[] responseData = this.DownloadData(uriString);
                return DecodeAndFlatten(responseData,dataEncoding);
            } catch(WebException we) {
                Logger.LogError("GetSrc:" + uriString,we);
                return "error";
            }
        }

        /// <summary>
        /// Downloads a file from the given URL to a local path.
        /// </summary>
        /// <param name="urlString">File URL.</param>
        /// <param name="fileName">Full path of the local file.</param>
        /// <param name="msg">Empty on success; the exception message when a <see cref="WebException"/> occurs.</param>
        /// <returns><c>true</c> on success, <c>false</c> when a <see cref="WebException"/> occurs.</returns>
        public bool GetFile(string urlString,string fileName,out string msg) {
            try {
                this.DownloadFile(urlString,fileName);
                msg = string.Empty;
                return true;
            } catch(WebException we) {
                msg = we.Message;
                return false;
            }
        }

        /// <summary>
        /// Enumerates every cookie stored in the given container.
        /// </summary>
        /// <remarks>
        /// Reads the private fields <c>m_domainTable</c> and <c>m_list</c> through reflection.
        /// NOTE(review): these are implementation details of System.Net and may differ
        /// between runtime versions — confirm against the runtime this code targets.
        /// </remarks>
        /// <param name="cc">Container to enumerate; <c>null</c> yields an empty list.</param>
        /// <returns>A new list holding all cookies found in the container.</returns>
        public List<Cookie> GetAllCookies(CookieContainer cc) {
            List<Cookie> lstCookies = new List<Cookie>();
            if(cc == null) {
                return lstCookies;
            }

            // Look the field up on CookieContainer itself: reflection on a derived
            // type would not find a private field declared in the base class.
            Hashtable table = (Hashtable)typeof(CookieContainer).InvokeMember("m_domainTable",
                PrivateInstanceField,null,cc,new object[] { });

            foreach(object pathList in table.Values) {
                SortedList lstCookieCol = (SortedList)pathList.GetType().InvokeMember("m_list",
                    PrivateInstanceField,null,pathList,new object[] { });
                foreach(CookieCollection colCookies in lstCookieCol.Values) {
                    foreach(Cookie c in colCookies) {
                        lstCookies.Add(c);
                    }
                }
            }
            return lstCookies;
        }

        /// <summary>
        /// Enumerates every cookie stored in this client's own container.
        /// </summary>
        /// <returns>A new list holding all cookies found in <see cref="Cookies"/>.</returns>
        public List<Cookie> GetAllCookies() {
            return GetAllCookies(this.cookieContainer);
        }

        // Decodes the response bytes with the named charset and strips all tab, CR and LF characters.
        private static string DecodeAndFlatten(byte[] responseData,string dataEncoding) {
            string srcString = Encoding.GetEncoding(dataEncoding).GetString(responseData);
            return srcString.Replace("\t","").Replace("\r","").Replace("\n","");
        }
        #endregion 封装了PostData, GetSrc 和 GetFile 方法
    }

调用示例(Main 方法中):采集知网("知网","",0,0);

  /// <summary>
  /// Searches kns.cnki.net for records whose unit field matches <paramref name="StrUnit"/>
  /// and whose publish date lies in the last 30 days, parses the result rows into
  /// <c>LunWen</c> entities and stores them with <c>LunWenDal.AddBatch</c>.
  /// </summary>
  /// <param name="type">Value written to <c>wt_Type</c>; "知网" is used when blank.</param>
  /// <param name="StrUnit">Unit to search for; <c>BaseDAL.GetSystemInstitutions()</c> is used when blank.</param>
  /// <param name="minPage">First result page to fetch; 1 is used when the value is not positive.</param>
  /// <param name="maxPage">Last result page to fetch; 10 is used when the value is not positive.</param>
  public void 采集知网(string type,string StrUnit,int minPage,int maxPage) {
      // Number of records requested per result page (also sent as RecordsPerPage).
      const int pageSize = 50;

      if(string.IsNullOrWhiteSpace(type)) { type = "知网"; }
      if(string.IsNullOrWhiteSpace(StrUnit)) { StrUnit = BaseDAL.GetSystemInstitutions(); }
      if(minPage <= 0) { minPage = 1; }
      if(maxPage <= 0) { maxPage = 10; }

      Dictionary<string,string> postData = new Dictionary<string,string>();
      postData.Add("pagename","brief_result_aspx");
      postData.Add("dbPrefix","SCDB");
      postData.Add("dbCatalog","中国学术文献网络出版总库");
      postData.Add("ConfigFile","SCDB.xml");
      postData.Add("db_opt","CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD");
      postData.Add("danwei_1_sel","AF");
      // A null unit is sent as an empty string, as the original concatenation did.
      postData.Add("danwei_1_value1",StrUnit ?? "");
      postData.Add("publishdate_from",DateTime.Now.AddDays(-30).ToString("yyyy-MM-dd"));
      postData.Add("publishdate_to",DateTime.Now.ToString("yyyy-MM-dd"));
      postData.Add("danwei_1_special1","=");
      postData.Add("isinEn","1");
      string postDataStr = postData.ToUrlParams();

      StringBuilder data = new StringBuilder();
      // WebClient is disposable; the client's cookie container carries the session
      // between the requests below, so no manual Cookie header is needed.
      using(WebClientHelper webClient = new WebClientHelper()) {
          // Establish the search session.
          webClient.PostData("https://kns.cnki.net/kns/request/SearchHandler.ashx",postDataStr);

          // Query the first result page to obtain the total count and the QueryID.
          string url = "https://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_result_aspx&isinEn=1&dbPrefix=SCDB&dbCatalog=%e4%b8%ad%e5%9b%bd%e5%ad%a6%e6%9c%af%e6%96%87%e7%8c%ae%e7%bd%91%e7%bb%9c%e5%87%ba%e7%89%88%e6%80%bb%e5%ba%93&ConfigFile=SCDB.xml&research=off&t=1592202301266";
          string res = webClient.PostData(url,postDataStr);

          Match countMatch = Regex.Match(res,"id=\"resultcount\" name=\"resultcount\" value=\"([0-9]*)\"");
          // ToDouble() is a project extension; presumably it returns a double — TODO confirm.
          var num = countMatch.Groups[1].Value.ToDouble();
          Console.WriteLine("num:"+num);

          Match queryMatch = Regex.Match(res,"QueryID=[0-9]*",RegexOptions.Singleline);
          string QueryID = queryMatch.Value.Replace("QueryID=","");
          Console.WriteLine("QueryID:"+QueryID);

          if(num <= 0) {
              Console.WriteLine("没有采集到新数据");
              return;
          }

          // Ceiling, not Round: a partially filled last page must still be fetched
          // (Round turned e.g. 20 hits into 0 pages and 70 hits into 1 page).
          var pageCount = Math.Ceiling(num / pageSize);

          // Fetch every requested page and collect the raw HTML.
          for(int i = minPage;i <= pageCount && i <= maxPage;i++) {
              url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage="+i+"&RecordsPerPage="+pageSize+"&QueryID="+QueryID+"&ID=&turnpage=1&tpagemode=L&dbPrefix=SCDB&Fields=&DisplayMode=custommode&PageName=ASP.brief_result_aspx&sKuaKuID=0&isinEn=1&";
              data.Append(webClient.PostData(url,postDataStr));
          }
      }

      // NOTE(review): the regex literals in this method lost their HTML tags when the
      // snippet was extracted. The opening <li> below is reconstructed from the
      // surviving <\/li>; the field patterns are kept exactly as found and must be
      // restored against the live page markup before this can extract anything useful.
      Regex rowRegex = new Regex(@"<li>(.*?)<\/li>");
      MatchCollection rows = rowRegex.Matches(data.ToString());

      Dictionary<string,string> pattern = new Dictionary<string,string>();
      pattern.Add("title",@"class=""title_c"">(.*)(.*)");
      pattern.Add("author",@"class=""author""> (.*),");
      pattern.Add("journal",@"class=""journal"">[\s\S]*?<\/span>");
      pattern.Add("database",@"class=""database"">(.*?)");
      pattern.Add("pub_dates",@"");
      pattern.Add("abstract_c",@"[\s\S]*?<\/p>");

      List<LunWen> modes = new List<LunWen>();
      foreach(Match mat in rows) {
          string row = mat.Groups[1].Value;

          // Rows without a title are skipped entirely.
          Match title = Regex.Match(row,pattern["title"],RegexOptions.Singleline);
          if(string.IsNullOrWhiteSpace(title.Value)) {
              continue;
          }

          LunWen model = new LunWen();
          model.wt_Type = type;
          model.wt_DownDate = DateTime.Now;
          model.wt_Title = title.Groups[2].Value;
          model.wt_Url = "https://kns.cnki.net"+title.Groups[1].Value;

          // Authors: every run of CJK characters inside the author cell.
          Match authorAndLink = Regex.Match(row,pattern["author"]);
          if(!string.IsNullOrWhiteSpace(authorAndLink.Value)) {
              MatchCollection autReg = Regex.Matches(authorAndLink.Value,@"[\u4E00-\u9FFF]+",RegexOptions.IgnoreCase|RegexOptions.IgnorePatternWhitespace);
              // Each name is prefixed with "," (so the result starts with a comma),
              // exactly as the original concatenation produced.
              StringBuilder authorNames = new StringBuilder();
              foreach(Match author in autReg) {
                  authorNames.Append(",").Append(author.Value);
              }
              model.wt_Author = authorNames.ToString();
          }

          // Unit: the CJK characters of the journal cell.
          Match journal = Regex.Match(row,pattern["journal"]);
          if(!string.IsNullOrWhiteSpace(journal.Value)) {
              model.wt_Unit = GetChineseWord(journal.Value);
          }

          // Resource type (e.g. journal paper) and source.
          Match database = Regex.Match(row,pattern["database"]);
          if(!string.IsNullOrWhiteSpace(database.Value)) {
              model.wt_ResouceType = GetChineseWord(database.Value);
              model.wt_Source = database.Groups[1].Value;
          }

          // Issue, e.g. "2022年01期".
          Match pubDates = Regex.Match(row,pattern["pub_dates"]);
          if(!string.IsNullOrWhiteSpace(pubDates.Value)) {
              model.wt_Volume = pubDates.Groups[1].Value;
          }

          // Abstract / content.
          Match abstractC = Regex.Match(row,pattern["abstract_c"]);
          if(!string.IsNullOrWhiteSpace(abstractC.Value)) {
              model.wt_Content = abstractC.Value;
          }

          modes.Add(model);
          Console.WriteLine(type+":"+model.wt_Title);
      }

      if(modes.Count > 0) {
          Console.WriteLine("采集到新数据准备入库");
          LunWenDal.AddBatch(modes);
          Console.WriteLine("采集到新数据已入库完成");
      } else {
          Console.WriteLine("没有采集到新数据");
      }
  }
获取中文:

           /// <summary>
           /// Returns all characters of <paramref name="oriText"/> that fall in the range
           /// U+4E00..U+9FFF, concatenated in their original order.
           /// </summary>
           /// <param name="oriText">Text to filter.</param>
           /// <returns>The matched characters joined together; empty when nothing matches.</returns>
           public static string GetChineseWord(string oriText) {
               StringBuilder collected = new StringBuilder();
               // Walk the matches one by one instead of materialising a MatchCollection.
               Match current = Regex.Match(oriText,@"[\u4E00-\u9FFF]+",RegexOptions.IgnoreCase);
               while(current.Success) {
                   collected.Append(current.Value);
                   current = current.NextMatch();
               }
               return collected.ToString();
           }

    实体类:

    /// <summary>
    /// Entity for one collected paper record (LunWen).
    /// </summary>
    public partial class LunWen {
        /// <summary>Creates an empty record; every property keeps its default value.</summary>
        public LunWen() { }

        /// <summary>Record identifier (wt_id).</summary>
        public int wt_id { get; set; }

        /// <summary>Unit name.</summary>
        public string wt_Unit { get; set; }

        /// <summary>Collection source, e.g. 知网 or 万文.</summary>
        public string wt_Type { get; set; }

        /// <summary>Paper title.</summary>
        public string wt_Title { get; set; }

        /// <summary>Paper URL.</summary>
        public string wt_Url { get; set; }

        /// <summary>Resource type.</summary>
        public string wt_ResouceType { get; set; }

        /// <summary>Authors.</summary>
        public string wt_Author { get; set; }

        /// <summary>Source.</summary>
        public string wt_Source { get; set; }

        /// <summary>Journal issue.</summary>
        public string wt_Volume { get; set; }

        /// <summary>Content.</summary>
        public string wt_Content { get; set; }

        /// <summary>Download date; <c>null</c> until set.</summary>
        public DateTime? wt_DownDate { get; set; }
    }