dongzhaorui 3 жил өмнө
parent
commit
6fdc1e1a57

+ 11 - 12
find_source/crawler/services/query.py

@@ -21,7 +21,7 @@ class QueryKeyWord(BasicSearch):
         self._interval = (kwargs.pop('query_interval', None) or 60)
         super(QueryKeyWord, self).__init__(**kwargs)
         self.engine = (engine or BingSearchEngine())
-        self._name = engine.__class__.__name__
+        self._name = self.engine.__class__.__name__
 
     def query_keyword(self):
         t_name = threading.currentThread().getName()
@@ -42,15 +42,14 @@ class QueryKeyWord(BasicSearch):
                 urls = self.engine.search(task['search'], cur_page)
                 for url in urls:
                     base_url = extract_base_url(url)
-                    if self.validator.data(base_url):
-                        continue
-                    lst.append(self.make_task(
-                        url=base_url,
-                        origin=task['origin'],
-                        groups=task['groups'],
-                        classify=self.visit_classify,
-                        weight=task['weight'],
-                    ))
+                    if not self.validator.data(base_url):
+                        lst.append(self.make_task(
+                            url=base_url,
+                            origin=task['origin'],
+                            groups=task['groups'],
+                            classify=self.visit_classify,
+                            weight=task['weight'],
+                        ))
                 '''推送数据挖掘队列'''
                 self.scheduler.add_excavate(lst, level=task['weight'])
                 logger.info(f'<{t_name}> - {self._name} - {task["search"]} - 第{cur_page}页,共{len(lst)}条')
@@ -74,7 +73,7 @@ class QueryOrganization(BasicSearch):
         self._interval = (kwargs.pop('query_interval', None) or 60)
         super(QueryOrganization, self).__init__(**kwargs)
         self.engine = (engine or QccSearchEngine())
-        self._name = engine.__class__.__name__
+        self._name = self.engine.__class__.__name__
 
     def query_org(self):
         t_name = threading.currentThread().getName()
@@ -97,7 +96,7 @@ class QueryOrganization(BasicSearch):
                 self.push_query(task)
                 if not is_url(url):
                     continue
-                '''此处通过收录器判断是否已收录网站,再决定是否推送数据挖掘队列'''
+                '''此处通过收录器判断是否已收录网站,再决定是否推送数据挖掘队列'''
                 if self.collector.data(task['domain']):
                     continue
                 '''设置任务为数据挖掘类型'''