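"""spider.py

A Selenium spider for the company pages at http://jzsc.mohurd.gov.cn/data/company
(the national construction market supervision platform). It walks the
qualification-category and province filters on the list page, opens each
company's detail tab, scrapes the basic profile plus the qualification,
bad-behavior, blacklist, and joint-punishment panels, and writes the results
to MongoDB, with a Redis hash guarding against duplicates. GeeTest captchas
are passed to a human operator.
"""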
import hashlib
import random
import time
from pathlib import Path

import pandas as pd
import redis
import requests
from loguru import logger
from lxml.html import fromstring, tostring
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

'''MongoDB'''
client = MongoClient('192.168.3.182', 27017)
company_tab = client['national']['company']

'''Redis service'''
r = redis.Redis(
    connection_pool=redis.ConnectionPool(
        host='192.168.3.182',
        port=6379,
        password='jianyu@python',
        db=10
    ),
    decode_responses=True
)
redis_key = 'jzsc_2022'
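# The Redis hash named by `redis_key` maps sha1(company_name) -> company_name
# and is used by crawl_spider() to skip companies that were already collected.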

'''Logging'''
log_path = (Path(__file__).absolute().parent / 'logs/log_{time:YYYYMMDD}.log').resolve()
logger.add(
    log_path,
    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
    level='INFO',
    rotation='00:00',
    retention='1 week',
    encoding='utf-8',
)

'''Company qualifications'''
COMPANY_QUALITY_MAPS = {
    '资质类别': 'quality_type',
    '资质证书号': 'quality_no',
    '资质名称': 'quality_name',
    '发证日期': 'fzrq',
    '发证有效期': 'fzyxq',
    '发证机关': 'fzjg',
}
'''Bad behavior records'''
BAD_BEHAVIOR_MAPS = {
    '诚信记录主体及编号': 'integrity_no',
    '决定内容': 'decide_content',
    '实施部门': 'ssbm',
    '决定日期与有效期': 'execution_date',
}
'''Blacklist records'''
BLACK_LIST_MAPS = {
    '黑名单记录主体及编号': 'black_list_no',
    '黑名单认定依据': 'black_list_rdyj',
    '认定部门': 'rdbm',
    '决定日期与有效期': 'execution_date',
}
'''Joint punishment records for dishonesty'''
PUNISH_MAPS = {
    '失信记录编号': 'punish_no',
    '失信联合惩戒记录主体': 'punish_subject',
    '法人姓名': 'legal_person',
    '列入名单事由': 'reason',
    '认定部门': 'rdbm',
    '列入日期': 'join_date',
}
CRAWL_SITE = 'http://jzsc.mohurd.gov.cn/data/company'
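# CRAWL_SITE is the company search page of the platform. The *_MAPS tables
# above keep their Chinese keys on purpose: extract_content() matches them
# against the column headers that pd.read_html() pulls out of the site's tables.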


def sha1(*args):
    """
    Hex digest over the given strings.

    @param args: strings to hash
    @return: hex digest
    """
    hash_sha1 = hashlib.sha1()
    for arg in args:
        hash_sha1.update(arg.encode('utf-8'))
    return hash_sha1.hexdigest()


def get_proxy(scheme=None, default=None, socks5h=False):
    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
    headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
    try:
        proxy = requests.get(url, headers=headers, timeout=15).json()
    except requests.RequestException:
        return default
    if not proxy:
        logger.debug('No proxy available...')
        return default
    proxies = proxy.get('data')
    if not proxies:
        return default
    if socks5h:
        proxy_items = proxies.get('http')
        proxies = {
            'http': proxy_items.replace('socks5', 'socks5h'),
            'https': proxy_items.replace('socks5', 'socks5h'),
        }
    return proxies if not scheme else proxies.get(scheme, default)
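# Usage sketch for the proxy helper (an assumption: the fetch endpoint answers
# with a requests-style proxies mapping under its "data" key; socks5h routes
# DNS resolution through the proxy as well):
#   proxies = get_proxy(socks5h=True)
#   requests.get(CRAWL_SITE, proxies=proxies, timeout=15)
# Nothing in this module calls get_proxy(); it is a standalone helper.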


def html2element(html):
    return fromstring(html)


def element2html(lxml_element):
    return tostring(lxml_element, encoding='utf-8').decode()


def display_prompt_popup(html):
    _element = html2element(html)
    node = _element.xpath('//div[@class="el-dialog__wrapper"]')[0]
    _popup_style = node.attrib.get('style')
    if _popup_style is not None:
        _styles = str(_popup_style).split(';')
        res = list(filter(lambda x: len(x) > 0, _styles))[-1].strip().lower()
        if res == 'display: none':
            '''no prompt dialog'''
            return False
    '''prompt dialog is shown'''
    return True


def display_geetest_panel(html):
    _element = html2element(html)
    node = _element.xpath('//div[@class="geetest_panel_next"]')
    if len(node) == 0:
        '''no captcha'''
        return False
    _geetest_panel = node[0]
    geetest_style = _geetest_panel.attrib.get('style')
    if geetest_style is not None and geetest_style == 'display: block;':
        '''captcha is shown'''
        return True
    else:
        '''no captcha'''
        return False
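# Both detectors parse a static snapshot of driver.page_source with lxml rather
# than querying the live DOM: the element-ui dialog wrapper counts as hidden
# only when the last entry of its inline style is "display: none", and the
# GeeTest panel counts as active only when it carries "display: block;".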


def prompt_popup(driver: Chrome, wait_time=None):
    while True:
        if not display_prompt_popup(driver.page_source):
            break
        logger.info(">>> dismissing the prompt dialog")
        driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
        time.sleep(1)
    '''pause between steps'''
    _wait_time = (wait_time or 1)
    time.sleep(_wait_time)


def geetest_panel(driver: Chrome, wait_time=None, save_img_to_local=False):
    while True:
        if not display_geetest_panel(driver.page_source):
            break
        logger.info(">>> captcha detected")
        text = input("After passing the captcha, enter 'y' to stop waiting: ")
        if text == 'y':
            continue
    _wait_time = (wait_time or 1)
    time.sleep(_wait_time)
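# Captcha handling is manual: the operator solves the GeeTest challenge in the
# browser window, then confirms at the prompt so the loop can re-check the page.
# save_img_to_local is accepted (and threaded through check_page) but unused.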


def check_page(driver: Chrome, wait_time=None, **kwargs):
    """Check the page for prompt dialogs and captcha panels."""
    wait_time = (wait_time or 1)
    prompt_popup(driver, wait_time=wait_time)
    geetest_panel(
        driver,
        wait_time=wait_time,
        save_img_to_local=kwargs.get('save_img_to_local'),
    )


def click(driver: Chrome, button, wait_time=None, allow_check_page=False, run_js=True):
    if run_js:
        driver.execute_script("arguments[0].click();", button)
    else:
        button.click()
    wait_time = (wait_time or 1)
    time.sleep(wait_time)
    if allow_check_page:
        check_page(driver, wait_time=wait_time)


def click_query(driver: Chrome, wait_time=None):
    """Click the search button."""
    button = driver.find_element_by_class_name("ssButton")
    wait_time = (wait_time or 1)
    click(driver, button, wait_time=wait_time)


def next_page(driver: Chrome):
    element = html2element(driver.page_source)
    node = element.xpath('//button[@class="btn-next"]')[0]
    attrib = node.attrib.get('disabled')
    if attrib is not None and attrib == 'disabled':
        '''last page reached'''
        return False
    else:
        '''keep paging'''
        button = driver.find_element_by_class_name('btn-next')
        click(driver, button)
        return True


def current_page(html):
    element = html2element(html)
    nodes = element.xpath('//ul[@class="el-pager"]/li')
    for node in nodes:
        if node.attrib.get('class') == 'number active':
            return node.text
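# Pagination state comes straight from the element-ui pager markup: the "next"
# button gains a disabled attribute on the last page, and the current page
# number is the <li> carrying the "number active" class.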


def extract_content(html):
    """Extract structured records from a detail-page panel."""
    results = []
    '''field mapping table'''
    _maps = {
        **COMPANY_QUALITY_MAPS,
        **BAD_BEHAVIOR_MAPS,
        **BLACK_LIST_MAPS,
        **PUNISH_MAPS,
    }
    '''convert to dataframes'''
    dfs = pd.read_html(html)
    if len(dfs) == 2:
        columns = list(dfs[0].columns.array)
        values = dfs[1].values
        '''merge header and body rows'''
        panel_container = [dict(zip(columns, val)) for val in values]
        '''rename fields'''
        for item in panel_container:
            _item = {}
            for key, val in item.items():
                if key in _maps:
                    _item[_maps[key]] = val
            results.append(_item)
    return results
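# Sketch of the layout extract_content() expects, based on the element-ui
# tables on the site (the concrete values below are illustrative, not real
# data): pd.read_html() finds exactly two <table> elements per panel, the
# first holding only the header row and the second holding the body, e.g.
#   dfs[0].columns -> ['资质类别', '资质证书号', ...]
#   dfs[1].values  -> [['建筑业企业资质', 'D1234567', ...], ...]
# dict(zip(columns, row)) then rebuilds one record per body row before the
# keys are renamed through the *_MAPS tables.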


def crawl_spider(driver: Chrome, handler):
    """Collect companies from the current list page."""
    exception_count = 0
    td_elements = driver.find_elements(By.XPATH, value='//table[@class="el-table__body"]//tr/td[3]')
    for td_element in td_elements:
        if exception_count > 3:
            '''data anomaly, stop collecting'''
            return False
        title = td_element.text
        '''dedupe by company name'''
        if r.hexists(redis_key, sha1(title)):
            logger.info(f"[duplicate] {title} - discarded")
            continue
        button = td_element.find_element_by_class_name("link")
        click(driver, button, wait_time=random.randint(3, 10), run_js=False)
        for current_handler in driver.window_handles:
            if current_handler == handler:
                continue
            '''switch to the popup tab'''
            driver.switch_to.window(current_handler)
            current_url = driver.current_url
            '''first entry into the detail page: check for prompt dialogs and captcha'''
            check_page(driver, wait_time=random.randint(2, 6))
            '''company record'''
            company = {}
            '''basic company data'''
            element = html2element(driver.page_source)
            nodes = element.xpath('//div[@class="detaile-header__info--table"]')
            for node in nodes:
                credit_no = "".join(node.xpath('./div[1]/div[1]/div[2]/text()')).strip()
                legal_person = "".join(node.xpath('./div[1]/div[2]/div[2]/text()')).strip()
                company_type = "".join(node.xpath('./div[2]/div[1]/div[2]/text()')).strip()
                address = "".join(node.xpath('./div[2]/div[2]/div[2]/text()')).strip()
                business_address = "".join(node.xpath('./div[3]/div[1]/div[2]/text()')).strip()
                company = {
                    'company_name': title,  # company name
                    'credit_no': credit_no,  # unified social credit code
                    'legal_person': legal_person,  # legal representative
                    'company_type': company_type,  # registration type
                    'address': address,  # place of registration
                    'business_address': business_address,  # business address
                    'industry': '',  # industry
                    'register_date': '',  # registration date
                    'tel_phone': '',  # contact number
                }
                # logger.info(company)
            '''company qualifications'''
            try:
                element = html2element(driver.page_source)
                node = element.xpath('//div[@class="panel-container"]')[0]
                company_quality_html = element2html(node)
                company_quality = extract_content(company_quality_html)
                company['company_quality'] = company_quality
                company['company_quality_html'] = {'html': company_quality_html}
            except IndexError:
                pass
            '''registered staff'''
            try:
                company_staff = driver.find_element_by_id("tab-companyStaff")
                click(driver, company_staff, allow_check_page=True)
                reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
                logger.info('>>> staff registration categories <<<')
                registrar = []
                for reg_button in reg_buttons:
                    # TODO paging within this tab is not handled yet
                    logger.info(f'[{reg_button.text}]')
                    click(driver, reg_button, wait_time=random.randint(1, 3))
                    element = html2element(driver.page_source)
                    nodes = element.xpath('//div[@id="pane-companyStaff"]//table[@class="el-table__body"]//tr')
                    for node in nodes:
                        name = "".join(node.xpath('./td[2]//span/text()')).strip()
                        id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
                        reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
                        reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
                        reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
                        registrar.append({
                            'name': name,  # name
                            'id_no': id_no,  # ID number
                            'reg_type': reg_type,  # registration category
                            'reg_no': reg_no,  # registration no. (practice seal no.)
                            'reg_major': reg_major,  # registered specialty
                        })
                company['company_staff'] = registrar
            except IndexError:
                pass
            '''bad behavior records'''
            try:
                bad_behavior = driver.find_element_by_id('tab-badBehavior')
                click(driver, bad_behavior, allow_check_page=True)
                element = html2element(driver.page_source)
                node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
                bad_behavior_html = element2html(node)
                bad_behaviors = extract_content(bad_behavior_html)
                company['bad_behavior'] = bad_behaviors
                company['bad_behavior_html'] = {'html': bad_behavior_html}
            except IndexError:
                pass
            '''blacklist records'''
            try:
                black_list = driver.find_element_by_id('tab-blackList')
                click(driver, black_list, allow_check_page=True)
                element = html2element(driver.page_source)
                node = element.xpath('//div[@id="pane-blackList"]/div')[0]
                black_list_html = element2html(node)
                black_list_array = extract_content(black_list_html)
                company['black_list'] = black_list_array
                company['black_list_html'] = {'html': black_list_html}
            except IndexError:
                pass
            '''joint punishment records for dishonesty'''
            try:
                punish = driver.find_element_by_id('tab-punishLog')
                click(driver, punish, allow_check_page=True)
                element = html2element(driver.page_source)
                node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
                punish_html = element2html(node)
                punish_array = extract_content(punish_html)
                company['punish'] = punish_array
                company['punish_html'] = {'html': punish_html}
            except IndexError:
                pass
            '''save the company record'''
            if len(company.get('credit_no', '')) > 0:
                company_tab.insert_one(company)
                r.hset(redis_key, sha1(title), title)
                logger.info(f'>>> {title} - {current_url} - collected and saved')
            else:
                exception_count += 1  # the page carried no company data
                logger.info(f'>>> {title} - {current_url} - collection failed - no social credit code')
            '''close the detail tab'''
            driver.close()
        '''back to the list page'''
        driver.switch_to.window(handler)
        '''delay before the next item'''
        time.sleep(2)
    return True
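# Detail pages open in a new tab, so crawl_spider() walks driver.window_handles,
# skips the main list-page handle, scrapes and closes each extra tab, and then
# switches focus back to the list before moving on to the next row.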


def downloader(driver: Chrome, handler):
    while True:
        logger.info(f">>> page {current_page(driver.page_source)} <<<")
        allow_crawl = crawl_spider(driver, handler)
        '''decide whether to keep collecting'''
        if not allow_crawl:
            logger.info("Site data anomaly, aborting collection")
            return False
        '''next page'''
        if not next_page(driver):
            logger.info('Collection finished')
            break
    return True


def select_province(driver: Chrome, records):
    """Select the place of registration."""
    '''open the province dropdown'''
    drop_down_button = driver.find_element_by_xpath(
        '//div[@class="region-select"]/div[1]/div[1]/span[1]/span[1]'
        '/i[contains(@class,"el-select__caret el-input__icon el-icon-arrow-up")]'
    )
    click(driver, drop_down_button, wait_time=1)
    '''pick the next unvisited province'''
    li_elements = driver.find_elements(by=By.XPATH, value='/html/body/div[@class="el-select-dropdown el-popper"][1]/div[1]/div[1]/ul/li')
    for element in li_elements:
        province = element.text
        if province not in records:
            logger.info(f'>> company registration province: {province} <<')
            click(driver, element, wait_time=1.5)
            records.append(province)
            return False
    return True


def select_categories(driver: Chrome, records):
    span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
    for element in span_elements:
        qualification = element.text
        if qualification not in records:
            logger.info(f'>> qualification category: {qualification} <<')
            records.setdefault(qualification, [])
        provinces = records.get(qualification)
        if provinces is not None:
            if len(provinces) < 32:
                click(driver, element, wait_time=1.5)
                crawl_finished = select_province(driver, provinces)
                if not crawl_finished:
                    click_query(driver, wait_time=2)
                    return False
    return True
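# Crawl progress is tracked in `records`, a dict of category label -> list of
# provinces already visited; a value of None marks a category to skip outright.
# The `< 32` bound is an assumption carried over from the original code,
# presumably covering the 31 provincial-level regions plus an "all" entry.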


def start(enable_remote_driver=False):
    '''
    Attach to a manually started Chrome instance:
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir="./data"
    '''
    options = webdriver.ChromeOptions()
    if enable_remote_driver:
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    options.add_argument("--disable-gpu")
    chrome_driver = webdriver.Chrome(
        executable_path="/Users/dongzhaorui/Downloads/chromedriver-mac-x64/chromedriver",
        options=options
    )
    main_handler = chrome_driver.current_window_handle  # handle of the main window
    '''close every other window'''
    for handler in chrome_driver.window_handles:
        if handler != main_handler:
            chrome_driver.switch_to.window(handler)
            chrome_driver.close()
    chrome_driver.switch_to.window(main_handler)
    chrome_driver.get(CRAWL_SITE)
    time.sleep(3)
    '''crawl progress (None marks a category to skip)'''
    records = {
        '全部': None,
        '造价咨询企业': None,
    }
    while True:
        crawl_finished = select_categories(chrome_driver, records)
        if crawl_finished:
            logger.info('Job finished')
            break
        '''download the data'''
        _continue = downloader(chrome_driver, main_handler)
        if not _continue:
            break
    if not enable_remote_driver:
        chrome_driver.quit()


if __name__ == '__main__':
    # while True:
    #     try:
    #         start(enable_remote_driver=True)
    #     except Exception:
    #         logger.info("waiting 100 seconds")
    #         time.sleep(100)
    start(enable_remote_driver=True)