此采集引擎基于 HttpClient 实现,支持 HTTP 与 HTTPS,支持自定义 UserAgent 和自定义 Header,支持 Proxy,既支持 HTML 抓取,也支持图片抓取。此框架分为三个部分:WebClient、WebRequest 以及 ResponseResult。其中 WebClient 是引擎最核心的部分,实现了资源下载;WebRequest 为请求部分,自定义 UserAgent、自定义 Header、设置 Proxy 全部针对 WebRequest 进行;ResponseResult 为响应部分,包括响应头、响应流以及响应 Cookie 等。
HTML 抓取测试
// 网页抓取测试 private static void testHTMLSeek(){ String token = ""; try { String status = ""; do { // 通过appkey 和 seckey 获取token String appkey = "youappkey"; String seckey = "youseckey"; WebRequest wb = new WebRequest(); //wb.setProxy("122.4.45.43:3937"); // 设置代理 wb.setMethod("GET"); wb.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"); wb.setUrl("https://api.qianzhan.com/OpenPlatformService/GetToken?type=JSON&appkey=" + appkey + "&seckey=" + seckey); ResponseResult sr = WebClient.download(wb); String tokenJson = sr.getResponseHtmlStr(); token = ParseJson(ParseJson(tokenJson, "result"), "token"); System.out.println("token:" + token); // 测试【多条件联合搜索】这个接口 JSONObject json = new JSONObject(); json.put("token", token); json.put("type", "JSON"); json.put("companyName","腾讯"); json.put("areaCode", ""); json.put("faRen", ""); json.put("bussinessDes", ""); json.put("address", ""); json.put("gd", ""); json.put("page", "1"); json.put("pagesize", "10"); wb.setMethod("POST"); wb.setUrl("https://api.qianzhan.com/OpenPlatformService/CombineIndexSearch"); wb.setForm(json.toString()); wb.setContentType("application/json"); sr = WebClient.download(wb); String result = sr.getResponseHtmlStr(); System.out.println(result); status = ParseJson(result, "status"); } while (status == "101" || status == "102"); } catch (Exception e1) { e1.printStackTrace(); } }
图片下载测试
// 图片抓取测试 private static void testImageSeek(){ WebRequest req = new WebRequest(); req.setUrl("https://img3.qianzhan.com/news/201803/30/20180330-40fe3e684227ed76_250x150.jpg"); req.setMethod("GET"); ResponseResult rsp = WebClient.download(req); byte[] imageBytes = rsp.getResponseContent(); try { String fileName = "测试图片.png"; String userprofile = System.getenv().get("USERPROFILE"); File file = new File(userprofile + "\\Desktop\\" + fileName); FileOutputStream fops = new FileOutputStream(file); fops.write(imageBytes); fops.flush(); fops.close(); System.out.println("图片已经写入到" + file.getAbsolutePath()); } catch (Exception e) { e.printStackTrace(); } }