@@ -53,17 +53,24 @@ class CrawlListPageSpider:
         label_info: dict = kwargs.get('label_info')
         area_id = kwargs.get('area_id')
         element = fromstring(response.text)
-        nodes = element.xpath('//div[@class="list-box"]/ul/li')
+        nodes = element.xpath('//div[contains(@class,"list-box")]/ul/li')
         results = []
         for node in nodes:
-            publish_time = str(node.xpath('./div[@class="ext-info"]/span[1]/text()')[0])
-            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
-            title = node.xpath('./div[@class="title"]/a/text()')[0]
+            title = node.xpath('./div[@class="title"]/a/text()')
             if title:
-                title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+                title = title[0]
+                competehref = node.xpath('./div[@class="title"]/a/@href')[0]
+                publish_time = "2023-" + str(node.xpath('./div[@class="title"]/span[1]/text()')[0])
+            else:
+                title = node.xpath('./div[@class="xiangmu"]/a/text()')[0]
+                competehref = node.xpath('./div[@class="xiangmu"]/a/@href')[0]
+                publish_time = node.xpath('./div[@class="agnmid gzsj"]/text()')[0].replace('/','-')
+
+            title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
             info = {
                 "title": title,
-                "competehref": node.xpath('./div[@class="title"]/a/@href')[0],
+                "competehref": competehref,
                 "area": area_dict.get(f"{area_id}"),
                 "publishtime": publish_time,
                 "l_np_publishtime": int2long(l_np_publishtime),
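
For reference, a minimal standalone sketch of the date handling the new branches introduce. The sample strings are hypothetical; the real spider takes them from the XPath queries shown in the hunk and wraps the epoch value in the project's int2long helper.

import time

def to_epoch(publish_time: str) -> int:
    # Both branches normalise the date to "%Y-%m-%d" before converting it.
    return int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

# Branch 1: div.title items only carry "MM-DD" in span[1], so the year is prefixed.
span_text = "05-18"            # hypothetical span text
print(to_epoch("2023-" + span_text))

# Branch 2: div.agnmid gzsj items carry "YYYY/MM/DD", so "/" is replaced with "-".
gzsj_text = "2023/05/18"       # hypothetical node text
print(to_epoch(gzsj_text.replace('/', '-')))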