@@ -17,6 +17,7 @@ import feapder
 import feapder.utils.tools as tools
 from cookie_pool import WebCookiePool
 from encode_info import encode_info
+from feapder import Item
 from feapder.db.mongodb import MongoDB
 from feapder.network.proxy_pool import swordfish_proxy
 from feapder.network.request import requests
@@ -60,7 +61,6 @@ class DetailSpider(feapder.AirSpider):
     cookie_pool = WebCookiePool(redis_key='zgztbcookie',
                                 page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
     _to_db = None
-    db_name = 'zgzb_list'

     @property
     def to_db(self):
@@ -168,15 +168,15 @@ class DetailSpider(feapder.AirSpider):
         self._data_transmission_limit = 15  # upper limit on received payload size, unit: MB; keeping it under 3 MB is recommended
         self._proxies = None  # global proxy
         self._cookies = None  # global browser info
+        self._coll_name = 'data_bak'  # production table
+        self._task_coll_name = 'zgzb_list'  # task table
+        self._ignore = ['_id', 'type', 'businessKeyWord', 'rowGuid']

     def start_requests(self):
-        task_lst = self.to_db.find(self.db_name,
-                                   {"type": "0", "timeout": None},
-                                   sort={"_id": -1},
-                                   limit=100)
         self._proxies = self.proxy
+        q = {"type": "0", "timeout": None}
+        task_lst = self.to_db.find(self._task_coll_name, q, sort={"_id": -1}, limit=100)
         for item in task_lst:
-            item.pop("rowGuid", None)
             schemaversion = item.pop("schemaVersion")
             try:
                 businessid, tenderprojectcode, _ = item['href'].split("&")
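The reworked start_requests() above reads its collection name from self._task_coll_name and issues a single find() call for the 100 newest pending tasks (type "0", timeout not yet set). For reference, a minimal sketch of the equivalent lookup in raw pymongo: the connection URI and database name are illustrative assumptions, only the collection name and query come from this change.

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # assumed connection, adjust to your environment
    coll = client["spider"]["zgzb_list"]               # task collection from this PR; database name is assumed
    # newest 100 tasks that have not been crawled yet
    tasks = coll.find({"type": "0", "timeout": None}).sort("_id", -1).limit(100)
    for task in tasks:
        print(task["_id"], task.get("href"))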
@@ -221,11 +221,11 @@ class DetailSpider(feapder.AirSpider):
         request.session.cookies.clear_session_cookies()

     def parse(self, request, response):
-        item = request.item
+        task_item = request.item
         meta = request.meta
         msg = meta['msg']
         href = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={meta['uuid']}"
-        item["href"] = href
+        task_item["href"] = href
         request = request.copy()  # copy the request instance to reuse the session
         response = self.get_response(request, response)  # receive the data
         if response.is_html:
@@ -250,7 +250,9 @@ class DetailSpider(feapder.AirSpider):
             yield request
         else:
             contenthtml, state = self.extract_html(request, response)
-            item["contenthtml"] = contenthtml or ''
+            # strip inline images / base64 data from the page
+            contenthtml = tools.re.sub('data:image/(.*?)["|\']', '', contenthtml)
+            task_item["contenthtml"] = contenthtml or ''

             # clean the page source
             special = {
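The tools.re.sub call introduced above drops inline data:image payloads (base64-embedded pictures) before the HTML is stored. A standalone illustration of what that pattern matches, using the standard re module with made-up markup:

    import re

    html = '<p>notice text</p><img src="data:image/png;base64,iVBORw0KGgoAAAANSU">'
    cleaned = re.sub('data:image/(.*?)["|\']', '', html)
    print(cleaned)  # '<p>notice text</p><img src=">' - the inline image data (and its closing quote) is removed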
@@ -258,22 +260,25 @@ class DetailSpider(feapder.AirSpider):
                 '</body[^>]*>|]]>': '',
             }
             detail = cleaner(contenthtml, special=special) if contenthtml else None
-            item["detail"] = detail or ''
+            task_item["detail"] = detail or ''

             # check the Chinese character count
             flag = "false" if tools.chinese_character(detail).total >= 20 else "true"
-            item["sendflag"] = flag
-
-            # update the crawl-task status
-            update_data = {"timeout": state.value}
-            self.to_db.update(self.db_name, update_data, {"_id": meta['_id']})
-
-            # push the data to the production database
-            ignore = ['_id', 'type', 'businessKeyWord']
-            insert = {k: v for k, v in item.items() if k not in ignore}
-            insert['comeintime'] = tools.ensure_int64(int(time.time()))
-            self.to_db.add("data_bak", insert)
-            log.info(f"{msg}--crawl succeeded, state: {state.value}")
+            task_item["sendflag"] = flag
+
+            # save to data_bak
+            data = {k: v for k, v in task_item.items() if k not in self._ignore}
+            data['comeintime'] = tools.ensure_int64(tools.get_current_timestamp())
+            item = Item(**data)
+            item.table_name = self._coll_name
+            yield item
+            log.info(f"{msg} --upload succeeded, state: {state.value}")
+
+            # update the task table
+            item = Item(timeout=state.value, _id=meta['_id'])
+            item.table_name = self._task_coll_name
+            yield item.to_UpdateItem()
+            log.debug(f"{msg} --crawl finished")


 if __name__ == "__main__":
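With this change the spider no longer writes to MongoDB directly in parse(); it yields feapder Item objects and lets the item pipeline perform the inserts and updates. A minimal sketch of that pattern (field values are placeholders; the exact update behaviour depends on the configured pipeline):

    from feapder import Item

    # insert a cleaned record into the production collection
    data = {"href": "http://example.com/detail", "detail": "cleaned text", "sendflag": "false"}
    record = Item(**data)
    record.table_name = "data_bak"               # production collection named in this PR
    # yield record                               # inside parse(), the pipeline performs the insert

    # mark the originating task as done; to_UpdateItem() turns the item into an update
    task = Item(timeout=200, _id="60adeadbeef")  # placeholder values
    task.table_name = "zgzb_list"
    # yield task.to_UpdateItem()                 # the pipeline applies it as an update instead of an insert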