|
@@ -14,7 +14,7 @@ from utils.databases import mongo_table, int2long, es_query, redis_client
|
|
|
from utils.execptions import CrawlError, YbwCrawlError
|
|
|
from utils.log import logger
|
|
|
from utils.socks5 import Proxy
|
|
|
-from utils.tools import sha1
|
|
|
+from utils.tools import sha1,clean_title
|
|
|
|
|
|
CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
|
|
|
|
|
@@ -115,7 +115,7 @@ class ListSpider:
|
|
|
if '-' not in publish_time:
|
|
|
publish_time = "".join(node.xpath('./td[6]/text()')).strip()
|
|
|
area = "".join("".join(node.xpath('./td[5]/text()')).split())
|
|
|
- title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
|
|
|
+ title = clean_title("".join("".join(node.xpath('./td[2]/a/text()')).split()))
|
|
|
competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
|
|
|
item = {
|
|
|
"site": "元博网(采购与招标网)",
|