@@ -53,17 +53,24 @@ class CrawlListPageSpider:
         label_info: dict = kwargs.get('label_info')
         area_id = kwargs.get('area_id')
         element = fromstring(response.text)
-        nodes = element.xpath('//div[@class="list-box"]/ul/li')
+        nodes = element.xpath('//div[contains(@class,"list-box")]/ul/li')
         results = []
         for node in nodes:
-            publish_time = str(node.xpath('./div[@class="ext-info"]/span[1]/text()')[0])
-            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
-            title = node.xpath('./div[@class="title"]/a/text()')[0]
+            title = node.xpath('./div[@class="title"]/a/text()')
             if title:
-                title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+                title = title[0]
+                competehref = node.xpath('./div[@class="title"]/a/@href')[0]
+                publish_time = "2023-" + str(node.xpath('./div[@class="title"]/span[1]/text()')[0])
+            else:
+                title = node.xpath('./div[@class="xiangmu"]/a/text()')[0]
+                competehref = node.xpath('./div[@class="xiangmu"]/a/@href')[0]
+                publish_time = node.xpath('./div[@class="agnmid gzsj"]/text()')[0].replace('/','-')
+
+            title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
             info = {
                 "title": title,
-                "competehref": node.xpath('./div[@class="title"]/a/@href')[0],
+                "competehref": competehref,
                 "area": area_dict.get(f"{area_id}"),
                 "publishtime": publish_time,
                 "l_np_publishtime": int2long(l_np_publishtime),
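
For reference, a minimal standalone sketch of the date handling the new branches introduce. The sample strings are hypothetical; the real spider takes them from the XPath queries shown in the hunk and wraps the epoch value in the project's int2long helper.

import time

def to_epoch(publish_time: str) -> int:
    # Both branches normalise the date to "%Y-%m-%d" before converting it.
    return int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))

# Branch 1: div.title items only carry "MM-DD" in span[1], so the year is prefixed.
span_text = "05-18"            # hypothetical span text
print(to_epoch("2023-" + span_text))

# Branch 2: div.agnmid gzsj items carry "YYYY/MM/DD", so "/" is replaced with "-".
gzsj_text = "2023/05/18"       # hypothetical node text
print(to_epoch(gzsj_text.replace('/', '-')))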