你带酒来,我有故事

天眼查企业详情抓取

:: 代码生涯 二十画生 2794℃ 0评论

一、抓取思路

获取天眼查企业详情,最主要是获取cookie中的两个关键参数:token与_utm。

其中token比较简单,从 http://www.tianyancha.com/tongji/xxxx.json 中的 v 参数 可以解析获取。

_utm也不难,主要是研究 v 参数以及对应的主JS文件,从中解析获取。

 

二、抓取代码

程序猿还是喜欢看代码,ok, show you the code 🙂

/// <summary>
/// Downloads Tianyancha's recent-results feed and, for every entry with a non-empty name,
/// requests the company detail JSON and maps it onto a <c>Company</c>.
/// </summary>
/// <remarks>
/// The detail request carries two extra cookies, <c>token</c> and <c>_utm</c>. Both are derived
/// from the <c>v</c> field of the <c>/tongji/</c> JSON response: <c>v</c> is a comma-separated list
/// of character codes that is decoded (via <c>JSUtil.GetTokenStr</c>) into a JavaScript snippet
/// containing the token and a list of indices into the <c>SoGou</c> lookup table.
/// </remarks>
/// <returns>
/// The companies whose detail response contained a non-empty name. The list is empty when the
/// feed body is empty or its <c>data</c> member is missing or has no elements.
/// </returns>
/// <exception cref="FormatException">
/// The decoded token script does not contain the expected <c>token=</c> / <c>return'</c> markers.
/// </exception>
public static List<Company> CrawlCompanyName_TianyanChaV2()
{
    List<Company> companys = new List<Company>();

    string userAgent = UserAgentPack.GetUserAgentRandom();
    string url = "http://www.tianyancha.com/recentResult.json";

    // Request the home page first; its Set-Cookie value is sent along with the feed request.
    WebRequest wb = new WebRequest();
    wb.Url = "http://www.tianyancha.com/";
    wb.Method = "GET";
    wb.UserAgent = userAgent;
    StringResult sr = WebClient.DownloadString(wb);

    // Reuse the same request object for the recent-results feed.
    wb.Url = url;
    wb.Reference = "http://www.tianyancha.com/";
    wb.Cookie = sr.SetCookie;
    wb.SpecialHeadCollection = new Dictionary<string, string>() { { "Accept", "application/json, text/plain, */*" } };
    wb.Encoding = "utf-8";
    StringResult sr2 = WebClient.DownloadString(wb);
    string html = sr2.HtmlString;

    if (string.IsNullOrEmpty(html))
    {
        return companys;
    }

    // Lookup table that turns the index list embedded in the token script into the _utm cookie value.
    // It never changes, so it is built once instead of once per company.
    string[] SoGou = { "1", "8", "o", "s", "z", "u", "n", "v", "m", "b", "9", "f", "d", "7", "h", "c", "p", "y", "2", "0", "3", "j", "-", "i", "l", "k", "t", "q", "4", "6", "r", "a", "w", "5", "e", "x", "g" };

    try
    {
        JObject jo = (JObject)JsonConvert.DeserializeObject(html);
        JArray ja = jo["data"] as JArray;

        // Either condition alone means there is nothing to crawl (the original combined them with
        // "&&", which dereferenced a null array and let an empty one through).
        if (ja == null || ja.Count < 1)
        {
            return companys;
        }

        foreach (JToken jObject in ja)
        {
            string cName = ConvertObject2String(jObject["name"]);
            if (string.IsNullOrEmpty(cName))
            {
                continue;
            }

            Company c = new Company();

            // NOTE(review): the id is hard-coded, so every feed entry fetches the same detail page.
            // This looks like a debugging leftover for ConvertObject2String(jObject["id"]) - confirm
            // before relying on the results.
            string id = "9519792";

            // Fetch the /tongji/ JSON whose "v" field encodes the token script. The token and _utm
            // cookies derived from it are required by the detail endpoint.
            string jsUrl = "http://www.tianyancha.com/tongji/3014593517.json?random=1488861305764" + DateTime.Now.Second;
            wb.Url = jsUrl;
            wb.SpecialHeadCollection = new Dictionary<string, string>() { { "Accept", "application/json, text/plain, */*" }, { "Accept-Encoding", " gzip, deflate, sdch" }, { "Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6,sw;q=0.4" }, { "Tyc-From", "normal" }, { "CheckError", "check" } };
            wb.Reference = "http://www.tianyancha.com/company/" + id;
            wb.Cookie = CombineCookies(sr, sr2);
            StringResult jsSR = WebClient.DownloadString(wb);
            JObject jsJO = (JObject)JsonConvert.DeserializeObject(jsSR.HtmlString);
            string v = ConvertObject2String(jsJO["data"]["v"]);

            // JavaScript that decodes the comma-separated character codes in v back into text.
            string js = " var getTokenStr = function() { var v = \"" + v + "\"; var fnStr = \"\"; for (var arr = v.split(\",\"), i = 0; i < arr.length; i++) fnStr += String.fromCharCode(arr[i]); return fnStr;}";

            // Sample of the decoded script:
            //   !function(n){document.cookie='token=96bf9be4a0a34ba5aff42c3eeb14e526;path=/;';
            //   n.wtf=function(){return'20,11,31,13,...,18,10'}}(window);
            string tokenStr = JSUtil.GetTokenStr(js);
            string token = ExtractBetweenMarkers(tokenStr, "token=", ";path=");
            string utmIndexList = ExtractBetweenMarkers(tokenStr, "return'", "'}}(window");

            // Each index selects one entry of SoGou; concatenated they form the _utm value,
            // e.g. _utm=391cabe10a1f4b9991d6c63a90290352.
            string[] utmIndices = utmIndexList.Split(',');
            string[] utmParts = new string[utmIndices.Length];
            for (int i = 0; i < utmIndices.Length; i++)
            {
                utmParts[i] = SoGou[Convert.ToInt32(utmIndices[i])];
            }
            string _utm = string.Concat(utmParts);

            // Request the detail JSON with both derived cookies appended to the session cookies.
            string detailUrl = "http://www.tianyancha.com/v2/company/" + id + ".json";
            wb.Url = detailUrl;
            wb.ContentType = "application/x-www-form-urlencoded;charset=UTF-8";
            wb.Cookie = CombineCookies(wb.Cookie, "token=" + token, wb.Url);
            wb.Cookie = CombineCookies(wb.Cookie, "_utm=" + _utm, wb.Url);
            StringResult sr3 = WebClient.DownloadString(wb);
            string html3 = sr3.HtmlString;
            JObject jo3 = (JObject)JsonConvert.DeserializeObject(html3);
            JToken detail = jo3["data"];

            c.oc_name = ConvertObject2String(detail["name"]);
            c.ct_creditcode = ConvertObject2String(detail["creditCode"]);
            c.oc_regOrgName = ConvertObject2String(detail["regInstitute"]);
            c.oc_code = ConvertObject2String(detail["orgNumber"]);
            c.oc_address = ConvertObject2String(detail["regLocation"]);
            // Guard against a missing name so the entry is skipped below instead of crashing here.
            c.oc_type = (!string.IsNullOrEmpty(c.oc_name) && c.oc_name.Contains("公司")) ? "企业法人" : "";
            c.oc_starttime = ConvertToDateTime(ConvertObject2String(detail["fromTime"]));
            // NOTE(review): this overwrites the start time with "toTime"; it presumably should be
            // assigned to an end-time property instead - confirm against the Company class.
            c.oc_starttime = ConvertToDateTime(ConvertObject2String(detail["toTime"]));

            // When no organisation code is returned, fall back to characters 9-17 of the credit code.
            // The length check keeps one short/malformed code from aborting the whole crawl.
            if (string.IsNullOrEmpty(c.oc_code)
                && !string.IsNullOrEmpty(c.ct_creditcode)
                && c.ct_creditcode.Length >= 17)
            {
                c.oc_code = c.ct_creditcode.Substring(8, 9);
            }

            if (!string.IsNullOrEmpty(c.oc_name))
            {
                companys.Add(c);
            }
        }
    }
    catch (Exception)
    {
        Console.WriteLine("解析失败");
        // Rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }

    return companys;
}

/// <summary>
/// Returns the text located between the first occurrence of <paramref name="startMarker"/> and the
/// next occurrence of <paramref name="endMarker"/> after it.
/// </summary>
/// <exception cref="FormatException">Either marker is missing from <paramref name="text"/>.</exception>
private static string ExtractBetweenMarkers(string text, string startMarker, string endMarker)
{
    int markerIndex = text == null ? -1 : text.IndexOf(startMarker, StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        throw new FormatException("Marker \"" + startMarker + "\" was not found in the token script.");
    }

    int contentStart = markerIndex + startMarker.Length;
    int contentEnd = text.IndexOf(endMarker, contentStart, StringComparison.Ordinal);
    if (contentEnd < 0)
    {
        throw new FormatException("Marker \"" + endMarker + "\" was not found in the token script.");
    }

    return text.Substring(contentStart, contentEnd - contentStart);
}

 

代码写起来其实很简单。为了不给天眼查带来影响,主要的SoGou代码已经做了去除处理。 🙂

我们要养成爬虫工程师的气质,具备了这种气质,有些地方一看就觉得可疑,就可以入手分析了。 🙂

 

 

转载请注明:二十画生 » 天眼查企业详情抓取

喜欢 (6)
发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址