Эх сурвалжийг харах

中国招标与采购网列表页维护

lizongze 2 жил өмнө
parent
commit
56e84ed284

+ 13 - 6
zbytb/crawler/spiders/ListPageSpider.py

@@ -53,17 +53,24 @@ class CrawlListPageSpider:
         label_info: dict = kwargs.get('label_info')
         area_id = kwargs.get('area_id')
         element = fromstring(response.text)
-        nodes = element.xpath('//div[@class="list-box"]/ul/li')
+        nodes = element.xpath('//div[contains(@class,"list-box")]/ul/li')
         results = []
         for node in nodes:
-            publish_time = str(node.xpath('./div[@class="ext-info"]/span[1]/text()')[0])
-            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
-            title = node.xpath('./div[@class="title"]/a/text()')[0]
+            title = node.xpath('./div[@class="title"]/a/text()')
             if title:
-                title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+                title = title[0]
+                competehref = node.xpath('./div[@class="title"]/a/@href')[0]
+                publish_time = "2023-" + str(node.xpath('./div[@class="title"]/span[1]/text()')[0])
+            else:
+                title = node.xpath('./div[@class="xiangmu"]/a/text()')[0]
+                competehref = node.xpath('./div[@class="xiangmu"]/a/@href')[0]
+                publish_time = node.xpath('./div[@class="agnmid gzsj"]/text()')[0].replace('/','-')
+
+            title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
+            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
             info = {
                 "title": title,
-                "competehref": node.xpath('./div[@class="title"]/a/@href')[0],
+                "competehref": competehref,
                 "area": area_dict.get(f"{area_id}"),
                 "publishtime": publish_time,
                 "l_np_publishtime": int2long(l_np_publishtime),