dongzhaorui · 3 years ago
commit 94f737c0ac
2 changed files with 17 additions and 13 deletions:
  1. find_source/crawler/Task.py (+2 -2)
  2. find_source/crawler/__init__.py (+15 -11)

find_source/crawler/Task.py (+2 -2)

@@ -17,7 +17,7 @@ class Task(UserDict):
             url='',
             domain='',
             base_url='',
-            loop_times=0,
+            groups=None,
             sensitive=False,
             duplication=False,
             requirement=False,
@@ -29,7 +29,7 @@ class Task(UserDict):
             url=url,
             domain=domain,
             base_url=base_url,
-            loop_times=loop_times,
+            groups=(groups or ''),
             sensitive=sensitive,
             duplication=duplication,
             requirement=requirement,
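
Taken on its own, the Task.py change swaps the old loop_times counter for a groups label that defaults to an empty string. A minimal sketch of the changed constructor behavior, assuming Task subclasses UserDict and simply stores its keyword arguments (only the parameters visible in the hunk are reproduced):

    from collections import UserDict

    class Task(UserDict):
        def __init__(self, url='', domain='', base_url='', groups=None,
                     sensitive=False, duplication=False, requirement=False):
            super().__init__(
                url=url,
                domain=domain,
                base_url=base_url,
                groups=(groups or ''),   # None (or any falsy value) falls back to ''
                sensitive=sensitive,
                duplication=duplication,
                requirement=requirement,
            )

    print(Task(url='http://example.com')['groups'])                     # ''
    print(Task(url='http://example.com', groups='keywords')['groups'])  # 'keywords'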

find_source/crawler/__init__.py (+15 -11)

@@ -61,27 +61,30 @@ class BreadthCrawler:
                 print("关闭寻源爬虫")
                 break
 
-            # TODO how should level management be reflected in what the crawler collects?
-
             task_key, task = tasks
+            groups = task['groups']
             domain = extract_domain(task['url'])
-            visit_domain = self._validator.url(domain)
-            if not visit_domain:
+            allow_visit_domain = self._validator.url(domain)
+            if not allow_visit_domain:
                 continue
-            logger.info(f'preparing to visit: {domain}')
+
+            logger.info(f'requesting web site -> {task["url"]}')
             response = self._downloader.get(task['url'])
+            print(response, len(response.text))
             if response.status_code != 200 or response.text in ['']:
                 continue
-            page_source = response.text
+
             task['domain'] = domain
             base_url = extract_base_url(task['url'])
             task['base_url'] = base_url
+            page_source = response.text
             title = extract_page_title(page_source)
+            print(title)
             task['name'] = title
             try:
                 self.verify(task)
                 urls = self._parser(page_source, base_url)
-                new_tasks = [Task(url=url) for url in urls]
+                new_tasks = [Task(url=url, groups=groups) for url in urls]
                 self._scheduler.insert_tasks(new_tasks)
             except HostsRetrieveError:
                 pass
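
The net effect of this hunk is that the group label now travels with the crawl frontier: every link parsed out of a fetched page is wrapped in a Task that inherits the parent task's groups value. A condensed sketch of that propagation, with the parser and scheduler treated as stand-ins for the crawler's own components:

    # Sketch only: `parser` returns outbound URLs for a page, `scheduler`
    # queues tasks -- both are assumptions mirroring the code above.
    def expand_frontier(task, page_source, base_url, parser, scheduler):
        groups = task['groups']                      # label carried by the parent task
        new_tasks = [Task(url=url, groups=groups)    # children inherit the same label
                     for url in parser(page_source, base_url)]
        scheduler.insert_tasks(new_tasks)
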
@@ -89,7 +92,7 @@ class BreadthCrawler:
     def set_search_engine(self, engine=None):
         if isinstance(engine, JySearchEngine):
             self._engines.append(engine)
-            logger.info(f'[search engine - {engine.__class__.__name__}] added successfully')
+            logger.info(f'[search engine - {engine.__class__.__name__}]added successfully')
         return self
 
     def set_search_engines(self, engines):
@@ -99,9 +102,10 @@ class BreadthCrawler:
 
     def search_words(self, engine, words):
         for word in words:
-            logger.info(f"[search engine - {engine.__class__.__name__}]search:{word}")
+            word = str(word).replace(' ', '').strip()
+            logger.info(f"[{engine.__class__.__name__} - search]{word}")
             urls = engine.search(word)
-            lst = [Task(url=url) for url in urls]
+            lst = [Task(url=url, groups='keywords') for url in urls]
             self._scheduler.insert_tasks(lst)
 
     def enable_search_engines(self):
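
Search-seeded URLs get the complementary treatment: after whitespace is stripped from the query word, every result URL is queued under the fixed group 'keywords', and pages crawled from those seeds keep that label through the propagation sketched above. A brief sketch of the seeding step (engine.search is assumed to return a list of result URLs):

    # Sketch of keyword seeding under the fixed 'keywords' group label.
    def seed_from_search(engine, words, scheduler):
        for word in words:
            word = str(word).replace(' ', '').strip()   # drop embedded spaces and edge whitespace
            urls = engine.search(word)                  # assumed to return a list of URLs
            scheduler.insert_tasks([Task(url=url, groups='keywords') for url in urls])
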
@@ -112,7 +116,7 @@ class BreadthCrawler:
 
             futures = []
             for engine in self._engines:
-                logger.info(f"[search engine - {engine.__class__.__name__}] started successfully")
+                logger.info(f"[search engine - {engine.__class__.__name__}]started successfully")
                 f = executor.submit(self.search_words, engine, search_words)
                 f.add_done_callback(err_details)
                 futures.append(f)
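
The final hunk only reformats a log string, but for context the surrounding code fans each engine's keyword search out to a thread pool and reports failures through a done-callback. A generic sketch of that pattern, with err_details assumed to surface any exception raised by a finished worker:

    from concurrent.futures import ThreadPoolExecutor, wait

    def err_details(future):
        # Assumed callback shape: log any exception the worker raised.
        exc = future.exception()
        if exc is not None:
            print(f'search worker failed: {exc!r}')

    def run_engines(crawler, engines, search_words):
        with ThreadPoolExecutor() as executor:
            futures = []
            for engine in engines:
                f = executor.submit(crawler.search_words, engine, search_words)
                f.add_done_callback(err_details)
                futures.append(f)
            wait(futures)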