|
@@ -1,4 +1,5 @@
|
|
|
import time
|
|
|
+import re
|
|
|
from concurrent.futures import ThreadPoolExecutor, wait
|
|
|
from urllib.parse import urlencode
|
|
|
|
|
@@ -56,8 +57,11 @@ class CrawlListPageSpider:
|
|
|
for node in nodes:
|
|
|
publish_time = str(node.xpath("./td[4]/text()")[0])
|
|
|
l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
|
|
|
+ title = node.xpath("./td[2]/a/text()")[0]
|
|
|
+ if title:
|
|
|
+ title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
|
|
|
info = {
|
|
|
- "title": node.xpath("./td[2]/a/text()")[0],
|
|
|
+ "title": title,
|
|
|
"competehref": node.xpath("./td[2]/a/@href")[0],
|
|
|
"area": node.xpath("./td[1]/a/text()")[0],
|
|
|
"publishtime": publish_time,
|