|
@@ -1,6 +1,6 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
"""
|
|
|
-Created on 2024-04-26
|
|
|
+Created on 2025-07-04
|
|
|
---------
|
|
|
@summary: 广东省政府采购网 - 详情页
|
|
|
---------
|
|
@@ -13,7 +13,6 @@ sys.path.append(os.path.dirname(os.getcwd()))
|
|
|
from utils.tools import *
|
|
|
from utils.attachment import AttachmentDownloader
|
|
|
from threading import Timer
|
|
|
-from utils.clean_html import cleaner
|
|
|
import requests
|
|
|
import re
|
|
|
import time
|
|
@@ -22,11 +21,181 @@ import execjs
|
|
|
from parsel import Selector
|
|
|
from collections import namedtuple
|
|
|
|
|
|
def ctx():
    """Compile the JavaScript helper bundle and return the execjs context.

    The bundle is executed by the JS runtime that execjs selects and needs the
    ``jsencrypt`` and ``crypto-js`` packages to be resolvable via ``require``.

    JavaScript entry points exposed through ``.call(name, *args)``:

    * ``K(value)``        - encrypt ``value`` with the embedded RSA public key
      (JSEncrypt).
    * ``mm(e, t)``        - MD5 of the SHA1 hex of ``e + "_" + t + "_bosssoft_platform_095285"``.
    * ``get_njs(e)``      - ``K(path + "$$" + timestamp)`` where ``path`` is ``e``
      without its query string.
    * ``get_data(data)``  - base64-decode ``data`` (``de_str``) and then decode the
      resulting UTF-8 byte string (``decode_str``).
    * ``h(e)``            - hex digest produced by the hand-written hash routine
      ``a`` (presumably SHA-1, judging by its constants - TODO confirm).
    * ``pp(e)``           - MD5 hex digest via crypto-js.
    * ``mme(e, t)``       - build ``{time, url, sign}`` for path ``e``; when a
      non-empty token ``t`` is given, ``tokensign`` is added as well.

    Returns:
        The compiled execjs context object.
    """
    ex_js = '''
    window = global

    JSEncrypt = require('jsencrypt')
    CryptoJs = require('crypto-js')

    function K(value) {
        var encrypt = new JSEncrypt;
        var RSAPublicKey = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCS2TZDs5+orLYCL5SsJ54+bPCVs1ZQQwP2RoPkFQF2jcT0HnNNT8ZoQgJTrGwNi5QNTBDoHC4oJesAVYe6DoxXS9Nls8WbGE8ZNgOC5tVv1WVjyBw7k2x72C/qjPoyo/kO7TYl6Qnu4jqW/ImLoup/nsJppUznF0YgbyU/dFFNBQIDAQAB';
        encrypt.setPublicKey('-----BEGIN PUBLIC KEY-----' + RSAPublicKey + '-----END PUBLIC KEY-----')
        return encrypt.encrypt(value)
    }

    function mm(e, t) {
        return e += `_${t}_bosssoft_platform_095285`,
        t = CryptoJs.SHA1(e).toString(),
        CryptoJs.MD5(t).toString()
    }

    function get_njs(e){
        var t = (new Date).getTime();
        return K(String(e).split("?")[0] + "$$" + t)
    }

    // Decode a string whose char codes are UTF-8 bytes into a normal JS string.
    function decode_str(e) {
        var n, t, a, c, i, r;
        n = "",
        a = e.length,
        t = 0;
        while (t < a)
            switch (c = e.charCodeAt(t++),
            c >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                n += e.charAt(t - 1);
                break;
            case 12:
            case 13:
                i = e.charCodeAt(t++),
                n += String.fromCharCode((31 & c) << 6 | 63 & i);
                break;
            case 14:
                i = e.charCodeAt(t++),
                r = e.charCodeAt(t++),
                n += String.fromCharCode((15 & c) << 12 | (63 & i) << 6 | (63 & r) << 0);
                break
            }
        return n
    }

    // Table-driven base64 decoder; characters outside the alphabet are skipped.
    function de_str(e) {
        var n, t, a, c, i, r, o, u = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1];
        r = e.length,
        i = 0,
        o = "";
        while (i < r) {
            do {
                n = u[255 & e.charCodeAt(i++)]
            } while (i < r && -1 == n);
            if (-1 == n)
                break;
            do {
                t = u[255 & e.charCodeAt(i++)]
            } while (i < r && -1 == t);
            if (-1 == t)
                break;
            o += String.fromCharCode(n << 2 | (48 & t) >> 4);
            do {
                if (a = 255 & e.charCodeAt(i++),
                61 == a)
                    return o;
                a = u[a]
            } while (i < r && -1 == a);
            if (-1 == a)
                break;
            o += String.fromCharCode((15 & t) << 4 | (60 & a) >> 2);
            do {
                if (c = 255 & e.charCodeAt(i++),
                61 == c)
                    return o;
                c = u[c]
            } while (i < r && -1 == c);
            if (-1 == c)
                break;
            o += String.fromCharCode((3 & a) << 6 | c)
        }
        return o
    }

    function get_data(data){
        return decode_str(de_str(data))
    }

    var i = 0
      , r = 8;

    function a(e, t) {
        e[t >> 5] |= 128 << 24 - t % 32,
        e[15 + (t + 64 >> 9 << 4)] = t;
        for (var n, i, r, o = Array(80), a = 1732584193, u = -271733879, c = -1732584194, h = 271733878, d = -1009589776, f = 0; f < e.length; f += 16) {
            for (var p = a, m = u, g = c, v = h, C = d, y = 0; y < 80; y++) {
                o[y] = y < 16 ? e[f + y] : l(o[y - 3] ^ o[y - 8] ^ o[y - 14] ^ o[y - 16], 1);
                var A = s(s(l(a, 5), (A = u,
                i = c,
                r = h,
                (n = y) < 20 ? A & i | ~A & r : !(n < 40) && n < 60 ? A & i | A & r | i & r : A ^ i ^ r)), s(s(d, o[y]), (n = y) < 20 ? 1518500249 : n < 40 ? 1859775393 : n < 60 ? -1894007588 : -899497514));
                d = h,
                h = c,
                c = l(u, 30),
                u = a,
                a = A
            }
            a = s(a, p),
            u = s(u, m),
            c = s(c, g),
            h = s(h, v),
            d = s(d, C)
        }
        return Array(a, u, c, h, d)
    }

    function s(e, t) {
        var n = (65535 & e) + (65535 & t);
        return (e >> 16) + (t >> 16) + (n >> 16) << 16 | 65535 & n
    }

    function l(e, t) {
        return e << t | e >>> 32 - t
    }

    function u(e) {
        for (var t = Array(), n = (1 << r) - 1, i = 0; i < e.length * r; i += r)
            t[i >> 5] |= (e.charCodeAt(i / r) & n) << 24 - i % 32;
        return t
    }

    function c(e) {
        for (var t = i ? "0123456789ABCDEF" : "0123456789abcdef", n = "", r = 0; r < 4 * e.length; r++)
            n += t.charAt(e[r >> 2] >> 8 * (3 - r % 4) + 4 & 15) + t.charAt(e[r >> 2] >> 8 * (3 - r % 4) & 15);
        return n
    }

    function h(e) {
        return c(a(u(e), e.length * 8))
    }

    function pp(e) {
        return CryptoJs.MD5(e).toString()
    }

    // Build the signing fields for a request path; tokensign is only added
    // when a non-empty token is supplied.
    function mme(e, t) {
        var n = (new Date).getTime()
          , path = String(e).split("?")[0]
          , i = pp(h(`${n}_${path}_bosssoft_platform_095285`))
          , r = {};
        return r.time = n,
        r.url = path,
        r.sign = i,
        null != t && "" !== t && (i = pp(h("" + t + path + n)),
        r.tokensign = i),
        r
    }
    '''
    return execjs.compile(ex_js)
|
|
|
|
|
|
class Details:
|
|
|
|
|
|
def __init__(self):
|
|
|
- self.proxy = get_proxy()
|
|
|
+ self.proxy = get_QGIP()
|
|
|
self.db_table = Mongo_client().py_spider
|
|
|
self.db_name = self.db_table.theme_list
|
|
|
self.zt_details = self.db_table.data_bak
|
|
@@ -34,44 +203,6 @@ class Details:
|
|
|
self.redis_key = "ztpc_gdszfcgw_msg"
|
|
|
self.delete_key = ""
|
|
|
self.end_state = False
|
|
|
- self.headers = {
|
|
|
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
|
- "Accept-Language": "zh-CN,zh;q=0.9",
|
|
|
- "Cache-Control": "no-cache",
|
|
|
- "Connection": "keep-alive",
|
|
|
- "Pragma": "no-cache",
|
|
|
- "Upgrade-Insecure-Requests": "1",
|
|
|
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
|
|
|
- }
|
|
|
-
|
|
|
- def get_html(self, html_source):
|
|
|
- html_js = "".join(re.findall("var siteIdOriginal='';(.*?)\$\('#info_download'\).hide\(\)", html_source, re.S))
|
|
|
- if html_js:
|
|
|
- try:
|
|
|
- trans = "".join(re.findall(r"openTenderCode.replace(.*?);", html_js))
|
|
|
- trans_html = html_js.replace(trans, "('\\\\',\"\")")
|
|
|
- data = "function get_html(){" + trans_html + " return demandAnnouncement }"
|
|
|
- ctx = execjs.compile(data)
|
|
|
- html = ctx.call('get_html')
|
|
|
- return html
|
|
|
- except:
|
|
|
- return None
|
|
|
- else:
|
|
|
- return None
|
|
|
-
|
|
|
- def get_file_list(self, html, proxies=False):
|
|
|
- currPage = "".join(re.findall('var currPage = (.*?);', html))
|
|
|
- pageSize = "".join(re.findall('var pageSize = (.*?);', html))
|
|
|
- Id = "".join(re.findall('var currInfoId = "(.*?)"', html))
|
|
|
- if currPage and pageSize and Id:
|
|
|
- try:
|
|
|
- url = f"https://gdgpo.czt.gd.gov.cn/freecms/rest/v1/notice/selectNoticeDocInfo.do?currPage={currPage}&pageSize={pageSize}0&id={Id}"
|
|
|
- file_res = requests.get(url, headers=self.headers, proxies=proxies, timeout=60, verify=False)
|
|
|
- return file_res.json().get('data')
|
|
|
- except:
|
|
|
- return []
|
|
|
- else:
|
|
|
- return []
|
|
|
|
|
|
def text_search(self, content: str):
|
|
|
SearchText = namedtuple('SearchText', ['total'])
|
|
@@ -85,71 +216,99 @@ class Details:
|
|
|
|
|
|
def detail_get(self, response, item):
|
|
|
response.encoding = response.apparent_encoding
|
|
|
- root = Selector(text=response.text)
|
|
|
- detail_html = root.xpath('/html/body').extract_first()
|
|
|
-
|
|
|
- html = ''
|
|
|
- dxpath_list = ['//div[@id="content"]', '//div[@class="infoCommon"]', '//div[@class="noticeArea"]']
|
|
|
- for xpath in dxpath_list:
|
|
|
- html = root.xpath(xpath).extract_first()
|
|
|
- if html:
|
|
|
- break
|
|
|
-
|
|
|
- publishTime = "".join(re.findall('var publishTime = "(.*?)"', response.text))
|
|
|
- if publishTime:
|
|
|
- year = root.xpath('//input[@id="year"]').extract_first()
|
|
|
- month = root.xpath('//input[@id="month"]').extract_first()
|
|
|
- date = root.xpath('//input[@id="date"]').extract_first()
|
|
|
- y = publishTime.split(' ')[0].split('-')[0]
|
|
|
- m = publishTime.split(' ')[0].split('-')[1]
|
|
|
- d = publishTime.split(' ')[0].split('-')[2]
|
|
|
- html = html.replace(year, y)
|
|
|
- html = html.replace(month, m)
|
|
|
- html = html.replace(date, d)
|
|
|
-
|
|
|
- js_html = self.get_html(detail_html)
|
|
|
- if js_html and self.text_search(html).total < 20:
|
|
|
- html = js_html
|
|
|
+ dt = response.json().get('data')
|
|
|
+ if item.get('noticeType','') == "001101":
|
|
|
+ html = f'''
|
|
|
+ <table style="margin-left: 270px;">
|
|
|
+ <tbody>
|
|
|
+ <tr>
|
|
|
+ <td> 一、采购人: {dt.get('purchaser')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 二、采购计划编号:{dt.get('planCodes')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 三、采购计划名称:{dt.get('title')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 四、采购品目名称: {dt.get('catalogueNameList')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 五、采购预算金额(元):{dt.get('budget')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 六、需求时间: {dt.get('demandTime')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 七、采购方式: {dt.get('purchaseManner')}</td>
|
|
|
+ </tr>
|
|
|
+ <tr>
|
|
|
+ <td> 八、备案时间: {dt.get('recordTime')}</td>
|
|
|
+ </tr>
|
|
|
+ </tbody>
|
|
|
+ </table>
|
|
|
+ '''.replace('None', '')
|
|
|
+ elif item.get('noticeType','') == "001059":
|
|
|
+ html = f'''
|
|
|
+ <div>
|
|
|
+ <div> 一、采购项目名称:<span>{dt.get('title')}</span></div>
|
|
|
+ <div> 二、采购品目名称:<span>{dt.get('catalogueNameList')}</span></div>
|
|
|
+ <div> 三、本公告期限(不得少于5个工作日)自:
|
|
|
+ <span>{dt.get('noticeTime').split(' ')[0]} 至
|
|
|
+ <span>{dt.get('expireTime').split(' ')[0]}
|
|
|
+ </div>
|
|
|
+ <div>
|
|
|
+ 四、任何供应商、单位或者个人对本项目采购需求(征求意见稿)公告有异议的,可以自公告开始之日起至公告期满后5个工作日内将书面意见反馈给采购人、采购代理机构。
|
|
|
+ </div>
|
|
|
+ <div> 五、联系事项
|
|
|
+ <div>
|
|
|
+ <div> (一)采购人:<span>{dt.get('purchaser')}</span></div>
|
|
|
+ <div> 地址:<span>{dt.get('purchaserAddr')}</span></div>
|
|
|
+ <div style="padding-left: 47px;"> 联系人:<span>{dt.get('purchaserLinkMan')}</span></div>
|
|
|
+ <div> 联系电话:<span>{dt.get('purchaserLinkPhone')}</span></div>
|
|
|
+ <div> (二)采购代理机构:<span>{dt.get('agency')}</span></div>
|
|
|
+ <div> 地址:<span>{dt.get('agentAddress')}</span></div>
|
|
|
+ <div style="padding-left: 47px;"> 联系人:<span>{dt.get('agentLinkMan')}</span></div>
|
|
|
+ <div> 联系电话:<span>{dt.get('agentLinkPhone')}</span></div>
|
|
|
+ </div>
|
|
|
+ </div>
|
|
|
+ <div>
|
|
|
+ <div> 发布人:<span>{dt.get('purchaser')}</span></div>
|
|
|
+ <div> 发布时间:<span>{dt.get('noticeTime').split(' ')[0]}</div>
|
|
|
+ </div>
|
|
|
+ </div>
|
|
|
+ '''.replace('None', '')
|
|
|
+ else:
|
|
|
+ html = dt.get('content')
|
|
|
|
|
|
file_name_list = []
|
|
|
-
|
|
|
- file_list = root.xpath('//a[@href]')
|
|
|
+ file_list = dt.get('attchList') or []
|
|
|
attachments = {}
|
|
|
- file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
|
|
|
- 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
|
|
|
- if file_list:
|
|
|
- for index, info in enumerate(file_list):
|
|
|
- file_url = info.xpath('./@href').extract_first()
|
|
|
- file_name = info.xpath('./text()').extract_first("").strip()
|
|
|
- file_type = extract_file_type(file_name, file_url)
|
|
|
- if file_type and 'http' in file_url:
|
|
|
- file_name_list.append(file_name)
|
|
|
- attachment = AttachmentDownloader().fetch_attachment(
|
|
|
- file_name=file_name, file_type=file_type, download_url=file_url, proxies=self.proxy)
|
|
|
- attachments[str(len(attachments) + 1)] = attachment
|
|
|
-
|
|
|
- js_file_list = self.get_file_list(detail_html, self.proxy)
|
|
|
- if js_file_list:
|
|
|
- for infoo in js_file_list:
|
|
|
- file_name = infoo.get('fileName').strip()
|
|
|
- file_url = infoo.get('fileUrl').strip()
|
|
|
- file_type = infoo.get('fileExt').strip()
|
|
|
-
|
|
|
- if file_type not in file_types:
|
|
|
- file_type = file_name.split(".")[-1].lower()
|
|
|
-
|
|
|
- if file_type in file_types and 'http' in file_url:
|
|
|
- file_name_list.append(file_name)
|
|
|
+ for info in file_list:
|
|
|
+ file_url = info.get('fileUrl')
|
|
|
+ file_name = info.get('fileName')
|
|
|
+ file_type = extract_file_type(file_name, file_url)
|
|
|
+ if file_type and file_name not in file_name_list:
|
|
|
+ file_name_list.append(file_name)
|
|
|
+ attachment = AttachmentDownloader().fetch_attachment(
|
|
|
+ file_name=file_name, file_type=file_type, download_url=file_url)
|
|
|
+ attachments[str(len(attachments) + 1)] = attachment
|
|
|
+
|
|
|
+ f_list = Selector(text=html).xpath('//a[contains(@href,"upload")]')
|
|
|
+ if f_list:
|
|
|
+ for foo in f_list:
|
|
|
+ f_url = foo.xpath('./@href').extract_first("").strip()
|
|
|
+ f_name = foo.xpath('./text()').extract_first("").strip()
|
|
|
+ f_type = extract_file_type(f_name, f_url)
|
|
|
+ if f_type and f_name not in file_name_list:
|
|
|
+ file_name_list.append(f_name)
|
|
|
attachment = AttachmentDownloader().fetch_attachment(
|
|
|
- file_name=file_name, file_type=file_type, download_url=file_url, proxies=self.proxy)
|
|
|
+ file_name=f_name, file_type=f_type, download_url=f_url)
|
|
|
attachments[str(len(attachments) + 1)] = attachment
|
|
|
|
|
|
if attachments:
|
|
|
item['projectinfo'] = {"attachments": attachments}
|
|
|
|
|
|
- rm_list = ['//p[contains(@class,"info-title")]','//div[contains(@class,"info-source")]']
|
|
|
- html = remove_htmldata(rm_list,html,root)
|
|
|
-
|
|
|
new_html = html
|
|
|
for fn in file_name_list:
|
|
|
new_html = new_html.replace(fn, '')
|
|
@@ -165,9 +324,23 @@ class Details:
|
|
|
|
|
|
return True
|
|
|
|
|
|
- def fetch_request(self, item, proxies=False):
|
|
|
- response = requests.get(url=item.get("parse_url"), headers=self.headers,
|
|
|
- proxies=proxies, timeout=60, verify=False)
|
|
|
def fetch_request(self, item):
    """Request the detail payload for ``item`` with the signed headers.

    The JavaScript helper bundle is compiled once per call and reused for
    both signing helpers instead of being recompiled for each of them.

    Args:
        item: task dict; ``parse_url`` is the request URL and the optional
            ``request_params`` dict is sent as the query string.

    Returns:
        The ``requests`` response object, fetched through ``self.proxy``.
    """
    js = ctx()
    # NOTE(review): get_njs is invoked without a URL argument, so the JS side
    # encrypts the literal text "undefined" plus the timestamp - confirm that
    # this is what the server expects.
    nsssjss = js.call('get_njs')
    pms = js.call('mme', '/gpcms/rest/web/v2/info/selectInfoForIndex')

    headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "nsssjss": f"{nsssjss}",
        "sign": f"{pms.get('sign')}",
        "time": f"{pms.get('time')}",
        "url": f"{pms.get('url')}"
    }
    params = item.get('request_params') or {}
    response = requests.get(url=item.get("parse_url"), headers=headers, params=params,
                            proxies=self.proxy, timeout=60, verify=False)
    return response
|
|
|
|
|
|
def deal_request(self, item):
|
|
@@ -177,7 +350,7 @@ class Details:
|
|
|
org_item = item.copy()
|
|
|
while retry_times < 5:
|
|
|
try:
|
|
|
- response = self.fetch_request(item, self.proxy)
|
|
|
+ response = self.fetch_request(item)
|
|
|
state = response.status_code
|
|
|
if response is not None and state == 200:
|
|
|
self.detail_get(response, item)
|
|
@@ -186,7 +359,7 @@ class Details:
|
|
|
item = org_item
|
|
|
logger.error(f"{item['href']} 异常:{e}")
|
|
|
time.sleep(3)
|
|
|
- self.proxy = get_proxy()
|
|
|
+ self.proxy = get_QGIP()
|
|
|
retry_times += 1
|
|
|
return False
|
|
|
|
|
@@ -214,7 +387,7 @@ class Details:
|
|
|
# logger.debug(item)
|
|
|
if self.end_state:
|
|
|
break
|
|
|
- if count >= 200:
|
|
|
+ if count >= limit:
|
|
|
break
|
|
|
unicode_key = md5value(item.get('href') + item.get('title'))
|
|
|
if not self.rds.hexists(self.redis_key, unicode_key): # 除 动态字段 外所有字段去重
|
|
@@ -235,4 +408,4 @@ class Details:
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- Details().start(limit=1)
|
|
|
+ Details().start(limit=200)
|