一、抓取思路
获取天眼查企业详情,最主要是获取cookie中的两个关键参数:token与_utm。
其中token比较简单,从 http://www.tianyancha.com/tongji/xxxx.json 中的 v 参数 可以解析获取。
_utm也不难,主要是研究 v 参数以及它所对应的主JS文件,从中解析获取。
二、抓取代码
程序猿还是喜欢看代码,ok, show you the code 🙂
/// <summary>
/// Downloads the "recent result" list from tianyancha.com and, for every entry
/// that has a non-empty name, requests the company detail JSON and maps it to a
/// <see cref="Company"/>.
/// </summary>
/// <returns>
/// The companies whose detail record contained a non-empty name. The list is
/// empty when the list response is empty or carries no "data" array.
/// </returns>
/// <exception cref="FormatException">
/// The decoded token script does not contain the expected markers.
/// </exception>
/// <remarks>
/// Any exception raised while parsing is logged to the console and rethrown.
/// </remarks>
public static List<Company> CrawlCompanyName_TianyanChaV2()
{
    List<Company> companys = new List<Company>();
    string userAgent = UserAgentPack.GetUserAgentRandom();
    string url = "http://www.tianyancha.com/recentResult.json";

    // First request: the home page. Its Set-Cookie value is replayed on the next request.
    WebRequest wb = new WebRequest();
    wb.Url = "http://www.tianyancha.com/";
    wb.Method = "GET";
    wb.UserAgent = userAgent;
    StringResult sr = WebClient.DownloadString(wb);

    // Second request: the recent-result list, sent with the home page as referrer.
    wb.Url = url;
    wb.Reference = "http://www.tianyancha.com/";
    wb.Cookie = sr.SetCookie;
    wb.SpecialHeadCollection = new Dictionary<string, string>() { { "Accept", "application/json, text/plain, */*" } };
    wb.Encoding = "utf-8";
    StringResult sr2 = WebClient.DownloadString(wb);
    string html = sr2.HtmlString;

    if (string.IsNullOrEmpty(html))
    {
        return companys;
    }

    try
    {
        JObject jo = (JObject)JsonConvert.DeserializeObject(html);

        // Missing, non-array or empty "data" means there is nothing to crawl.
        JArray ja = jo["data"] as JArray;
        if (ja == null || ja.Count < 1)
        {
            return companys;
        }

        foreach (JToken jObject in ja)
        {
            string cName = ConvertObject2String(jObject["name"]);
            if (string.IsNullOrEmpty(cName))
            {
                continue;
            }

            Company c = new Company();

            // NOTE(review): the id is hard-coded, so every list entry requests the same
            // company. The per-entry lookup was: ConvertObject2String(jObject["id"]).
            string id = "9519792";

            // Obtain the two key Tianyancha cookie values, token and _utm; both are
            // essential for getting past the detail page's JS.
            string jsUrl = "http://www.tianyancha.com/tongji/3014593517.json?random=1488861305764" + DateTime.Now.Second;
            wb.Url = jsUrl;
            wb.SpecialHeadCollection = new Dictionary<string, string>()
            {
                { "Accept", "application/json, text/plain, */*" },
                { "Accept-Encoding", " gzip, deflate, sdch" },
                { "Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6,sw;q=0.4" },
                { "Tyc-From", "normal" },
                { "CheckError", "check" }
            };
            wb.Reference = "http://www.tianyancha.com/company/" + id;
            wb.Cookie = CombineCookies(sr, sr2);
            StringResult jsSR = WebClient.DownloadString(wb);

            JObject jsJO = (JObject)JsonConvert.DeserializeObject(jsSR.HtmlString);
            string v = ConvertObject2String(jsJO["data"]["v"]);

            // "v" is a comma-separated list of character codes; this script turns it back into text.
            string js = " var getTokenStr = function() { var v = \"" + v + "\"; var fnStr = \"\"; for (var arr = v.split(\",\"), i = 0; i < arr.length; i++) fnStr += String.fromCharCode(arr[i]); return fnStr;}";

            // Example of the decoded text:
            //   !function(n){document.cookie='token=96bf9be4a0a34ba5aff42c3eeb14e526;path=/;';
            //   n.wtf=function(){return'20,11,31,13,20,29,31,15,0,15,29,0,28,15,31,9,1,12,34,10,28,18,20,34,11,33,0,31,18,20,18,10'}}(window);
            string tokenStr = JSUtil.GetTokenStr(js);
            string token = ExtractBetweenMarkers(tokenStr, "token=", ";path=");
            string fxck = ExtractBetweenMarkers(tokenStr, "return'", "'}}(window");

            // Lookup table: each number in fxck selects one character of _utm.
            string[] SoGou = { "1", "8", "o", "s", "z", "u", "n", "v", "m", "b", "9", "f", "d", "7", "h", "c", "p", "y", "2", "0", "3", "j", "-", "i", "l", "k", "t", "q", "4", "6", "r", "a", "w", "5", "e", "x", "g" };
            string _utm = DecodeUtmFromIndexes(fxck, SoGou); // e.g. _utm=391cabe10a1f4b9991d6c63a90290352

            // Detail request. The headers and referrer set for the tongji request stay in effect.
            string detailUrl = "http://www.tianyancha.com/v2/company/" + id + ".json";
            wb.Url = detailUrl;
            wb.ContentType = "application/x-www-form-urlencoded;charset=UTF-8";
            wb.Cookie = CombineCookies(wb.Cookie, "token=" + token, wb.Url);
            wb.Cookie = CombineCookies(wb.Cookie, "_utm=" + _utm, wb.Url);
            StringResult sr3 = WebClient.DownloadString(wb);
            string html3 = sr3.HtmlString;

            JObject jo3 = (JObject)JsonConvert.DeserializeObject(html3);
            JToken ja3 = jo3["data"];
            c.oc_name = ConvertObject2String(ja3["name"]);
            c.ct_creditcode = ConvertObject2String(ja3["creditCode"]);
            c.oc_regOrgName = ConvertObject2String(ja3["regInstitute"]);
            c.oc_code = ConvertObject2String(ja3["orgNumber"]);
            c.oc_address = ConvertObject2String(ja3["regLocation"]);
            c.oc_type = c.oc_name.Contains("公司") ? "企业法人" : "";
            c.oc_starttime = ConvertToDateTime(ConvertObject2String(ja3["fromTime"]));
            // NOTE(review): this overwrites the value assigned on the previous line with
            // "toTime"; it was probably meant for an end-time field. Confirm against Company.
            c.oc_starttime = ConvertToDateTime(ConvertObject2String(ja3["toTime"]));

            // Fall back to characters 9-17 of the credit code when no organisation code
            // was returned; skip the fallback if the credit code is too short to slice.
            if (string.IsNullOrEmpty(c.oc_code)
                && !string.IsNullOrEmpty(c.ct_creditcode)
                && c.ct_creditcode.Length >= 17)
            {
                c.oc_code = c.ct_creditcode.Substring(8, 9);
            }

            if (!string.IsNullOrEmpty(c.oc_name))
            {
                companys.Add(c);
            }
        }
    }
    catch (Exception)
    {
        Console.WriteLine("解析失败");
        throw; // rethrow without resetting the stack trace
    }

    return companys;
}

/// <summary>
/// Returns the text located between the first <paramref name="startMarker"/> and the
/// next <paramref name="endMarker"/> that follows it.
/// </summary>
/// <exception cref="FormatException">Either marker is missing from <paramref name="text"/>.</exception>
private static string ExtractBetweenMarkers(string text, string startMarker, string endMarker)
{
    int markerPos = text == null ? -1 : text.IndexOf(startMarker, StringComparison.Ordinal);
    if (markerPos < 0)
    {
        throw new FormatException("Start marker not found: " + startMarker);
    }

    int valueStart = markerPos + startMarker.Length;
    int valueEnd = text.IndexOf(endMarker, valueStart, StringComparison.Ordinal);
    if (valueEnd < 0)
    {
        throw new FormatException("End marker not found: " + endMarker);
    }

    return text.Substring(valueStart, valueEnd - valueStart);
}

/// <summary>
/// Builds the _utm value by mapping each comma-separated number in
/// <paramref name="indexList"/> to the entry of <paramref name="table"/> at that position.
/// </summary>
private static string DecodeUtmFromIndexes(string indexList, string[] table)
{
    string[] parts = indexList.Split(',');
    System.Text.StringBuilder utm = new System.Text.StringBuilder(parts.Length);
    foreach (string part in parts)
    {
        utm.Append(table[Convert.ToInt32(part)]);
    }
    return utm.ToString();
}
代码写起来其实很简单,为了不给天眼查带来影响,主要的SoGou代码已经做了去除处理。 🙂
我们要养成爬虫工程师的气质,具备了这种气质,有些地方一看就觉得可疑,就可以入手分析了。 🙂