12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
# -*- coding: utf-8 -*-
"""
Created on 2024-10-19
---------
@summary: Copy every document from ``zjb_poc.qlm_data_lst`` into
    ``zjb_poc.jy_data_lst``, reshaping each one into the target record layout
    and inserting the results in batches.
---------
@author: Dzr
"""
import time
from datetime import datetime

MONGO_HOST = '192.168.3.182'
MONGO_PORT = 27017
DB_NAME = 'zjb_poc'
SOURCE_COLLECTION = 'qlm_data_lst'
TARGET_COLLECTION = 'jy_data_lst'

# Number of records sent to MongoDB per ``insert_many`` call.
BATCH_SIZE = 100

# Accepted layouts of the source ``updateTime`` string, tried in order.
PUBLISH_TIME_FORMATS = ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S')

# Record fields that are wrapped in ``bson.int64.Int64`` before insertion.
INT64_FIELDS = ('comeintime', 'l_np_publishtime')


def _publish_timestamp(publishtime):
    """Return *publishtime* as whole seconds since the epoch.

    The string is interpreted as a naive local datetime, exactly as
    ``datetime.strptime(...).timestamp()`` does.

    Raises:
        ValueError: if the string matches none of ``PUBLISH_TIME_FORMATS``.
    """
    for fmt in PUBLISH_TIME_FORMATS:
        try:
            return int(datetime.strptime(publishtime, fmt).timestamp())
        except ValueError:
            continue
    raise ValueError('unsupported publish time: {!r}'.format(publishtime))


def _split_area(area_name):
    """Split an ``'area-city[-...]'`` value into an ``(area, city)`` pair.

    A ``None`` value yields two empty strings rather than the text ``'None'``.
    """
    if area_name is None:
        return '', ''
    parts = str(area_name).split('-')
    city = parts[1] if len(parts) > 1 else ''
    return parts[0], city


def _or_empty(value):
    """Return *value*, or an empty string when it is ``None``."""
    return '' if value is None else value


def build_record(item, now=None):
    """Convert one source document into a target-collection record.

    Args:
        item: mapping read from the source collection.
        now: epoch seconds stored as ``comeintime``; defaults to the
            current time.

    Returns:
        A new dict holding the target fields in their insertion order. The
        two timestamp fields are plain ``int`` values.

    Raises:
        KeyError: if ``url``, ``updateTime``, ``channel``, ``tenderees`` or
            (when ``popTitle`` is absent) ``showTitle`` is missing.
        ValueError: if ``updateTime`` is in an unsupported format.
    """
    if now is None:
        now = time.time()

    title = item['popTitle'] if 'popTitle' in item else item['showTitle']
    publishtime = item['updateTime']
    area, city = _split_area(item.get('areaName'))

    # ``progName`` may be absent or None; neither may break the substring test.
    prog_name = item.get('progName')
    if '国土' in (prog_name or ''):
        toptype = prog_name
    else:
        toptype = item.get('noticeSegmentTypeName') or prog_name

    return {
        'site': '千里马',
        'channel': item['channel'],
        'spidercode': 'sdxzbiddingsjzypc',
        'area': area,
        'city': city,
        'district': '',
        'comeintime': int(now),
        'isdownload': False,  # whether the detail has been downloaded
        'isfailed': False,  # whether processing failed
        'title': title,  # title
        'href': item['url'],  # link to the notice
        'publishtime': publishtime,  # publish time (string)
        'l_np_publishtime': _publish_timestamp(publishtime),  # publish time (epoch seconds)
        'buyer': item['tenderees'],  # tendering organisation
        'toptype': toptype,  # notice type
        'winner': _or_empty(item.get('bidder')),  # winning bidder
        'agency': _or_empty(item.get('agent')),  # agency
    }


def _flush(collection, batch):
    """Insert *batch* into *collection* unordered and return its size."""
    collection.insert_many(batch, ordered=False)
    return len(batch)


def main():
    """Stream the source collection and bulk-insert the converted records."""
    # Imported here so the pure helpers above can be imported (and tested)
    # on a machine that has no MongoDB driver installed.
    from bson.int64 import Int64
    from pymongo import MongoClient

    count = 0
    batch = []
    # The client is a context manager, so the connection is closed on exit.
    with MongoClient(MONGO_HOST, MONGO_PORT) as client:
        database = client[DB_NAME]
        source = database[SOURCE_COLLECTION]
        target = database[TARGET_COLLECTION]
        with source.find() as cursor:
            for item in cursor:
                record = build_record(item)
                # Wrap the timestamps in Int64, as the original script did.
                for field in INT64_FIELDS:
                    record[field] = Int64(record[field])
                batch.append(record)
                if len(batch) >= BATCH_SIZE:
                    count += _flush(target, batch)
                    batch = []
                    print('已处理{}条'.format(count))
        # Insert whatever is left over after the last full batch.
        if batch:
            count += _flush(target, batch)
            print('已处理{}条'.format(count))
    print('数据处理结束')


if __name__ == '__main__':
    main()
|