bidding.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Author : lijunliang
# @Email : lijunliang@topnet.net.cn
# @File : law.py
# @Software: PyCharm
# Standard library
import argparse
import datetime
import os
import time
import uuid

# Third-party
import pandas as pd
from loguru import logger
# from pymongo import MongoClient

# Project modules
from module.ac_sensitive import ACAutomation
from module.load_classify import load_classify
from module.parse_file import parse_file_start
from module.parse_file import get_property
from module.price import get_pricing
from module.read_config import read_ini
from module.sql_operate import md5_exists
from module.sql_operate import save_field
# from module.abstract import make_summary
from util.convert2img import convert_img
from util.db_helper import DBHelper
from util.file_operations import generate_directory, del_directory
from util.oss_file import OssServeClient
# Import-time configuration: logging sink, CLI arguments, the file-type
# code table and the sensitive-word matcher.
logger.add("./logging/run.log", rotation="12:00")  # log file, rotated daily at 12:00
parser = argparse.ArgumentParser("指定监听端口")
parser.add_argument('-dir', '--file_dir', default="./data/file/", type=str, help="目录")
parser.add_argument('-config', '--config_path', default="./data/config.ini", type=str, help="配置文件config.ini")
parser.add_argument('-class', '--docClass', default="招标文件", type=str, help="类别")
parser.add_argument('-tags', '--docTags', default="", type=str, help="标签")
parser.add_argument('-pricing_model', '--pricing_type', default="页数", type=str, help="页数or字数")
parser.add_argument('-pricing', '--base_price', default=500, type=float, help="初始价钱")
# parser.add_argument('-addWater', '--Water', default="0" * 12, type=str, help="用户id")
parser.add_argument('-sensitive_file', '--sensitive_path', default="./data/sensitive_words.txt", type=str,
                    help="敏感词文件路径")
parser.add_argument('-classify_file', '--classify_path', default="./data/classify.csv", type=str,
                    help="分类文件")
args = parser.parse_args()  # NOTE: parses sys.argv at import time
# Maps a file extension to the numeric docFileType code saved with each record.
docType = {'doc': 1, 'docx': 1, 'ppt': 4, 'pptx': 4, 'xls': 3, 'xlsx': 3, 'txt': 5, 'pdf': 2, 'html': 2,
           'htm': 2}
# Sensitive-word matcher loaded from the word-list file.
ACA = ACAutomation()
ACA.parse(args.sensitive_path)
  45. def create_oss_object(oss_config: dict):
  46. """
  47. oss服务初始化函数
  48. :param oss_config:
  49. :return:
  50. """
  51. return OssServeClient(access_key_id=oss_config["access_key_id"],
  52. access_key_secret=oss_config["access_key_secret"],
  53. endpoint=oss_config["endpoint"],
  54. bucket_name=oss_config["bucket_name"])
  55. def link_db():
  56. '''
  57. 连接数据库
  58. :return:
  59. '''
  60. Config = read_ini(args.config_path)
  61. FileConfig = Config["oss_file_config"]
  62. MySqlConfig = Config["mysql_config"]
  63. previewConfig = Config["previewConfig"]
  64. attachConfig = Config["attachments_oss_Config"]
  65. FileOss = create_oss_object(FileConfig) # file文件上传oss
  66. previewOss = create_oss_object(previewConfig) # file文件上传oss
  67. attachOss = create_oss_object(attachConfig) # file文件上传oss
  68. MySql = DBHelper(MySqlConfig)
  69. return FileOss, MySql, previewOss, attachOss
# Module-level shared connections: file OSS, MySQL helper, preview OSS, attachment OSS.
FileOss, MySql, previewOss, attachOss = link_db()
  71. def check_file_type(doctype: str) -> bool:
  72. """
  73. 文件类型检测
  74. :param doctype:
  75. :return:
  76. """
  77. if doctype not in docType:
  78. logger.warning("%s文件类型不匹配---->" % doctype)
  79. return False
  80. return True
  81. @logger.catch
  82. def upload_oss(file_path: str, file_content: str, pdf_path: str, cover_path: str, persistent: dict) -> dict:
  83. """
  84. 文件上传oss
  85. :param file_path: 文件路径
  86. :param file_content: 解析文本
  87. :param persistent: 自定义请求头
  88. :return:
  89. """
  90. global FileOss, previewOss
  91. succeed = {}
  92. source_oss_name = str(uuid.uuid1(int(time.time()))) + "." + file_path.split(".")[-1]
  93. pdf_oss_name = str(uuid.uuid1(int(time.time()))) + "." + "pdf"
  94. cover_oss_name = str(uuid.uuid1(int(time.time()))) + "." + "png"
  95. text_oss_name = str(uuid.uuid1(int(time.time())))
  96. per_header = FileOss.create_oss_meta(persistent)
  97. if not per_header:
  98. per_header = None
  99. # 源文件上传
  100. with open(file_path, "rb") as file:
  101. state, request_id = FileOss.upload_bytes_file(source_oss_name, file, headers=per_header)
  102. if state:
  103. succeed["ossDocId"] = source_oss_name
  104. # pdf文件上传
  105. with open(pdf_path, "rb") as pdf:
  106. state, request_id = FileOss.upload_bytes_file(pdf_oss_name, pdf, headers=per_header)
  107. if state:
  108. succeed["ossPdfId"] = pdf_oss_name
  109. # 文本文件上传
  110. state, request_id = FileOss.upload_text_file(text_oss_name, file_content, headers=per_header)
  111. if state:
  112. succeed["ossTxtId"] = text_oss_name
  113. # 封面图片上传
  114. with open(cover_path, "rb") as cover:
  115. state, request_id = previewOss.upload_bytes_file(cover_oss_name, cover, headers=per_header)
  116. if state:
  117. succeed["previewImgId"] = cover_oss_name
  118. return succeed
  119. def get_field(file_path: str, persistent: dict):
  120. '''
  121. 文件获取重要字段
  122. :param file_path:
  123. :param file_md5:
  124. :param persistent:自定义请求头字段
  125. :return:
  126. '''
  127. field = {}
  128. # 解析文件,获得文本文档
  129. file_content, pages, pdf_path = parse_file_start(file_path)
  130. text_size = len(file_content)
  131. if text_size < 400: # 检测文本长度检测决定是否上传
  132. return {}
  133. if pages < 1:
  134. return {}
  135. # search = ACA.search(file_content) #敏感词检查
  136. # if search:
  137. # return field
  138. cover_path = convert_img(pdf_path)
  139. if not cover_path:
  140. return {}
  141. field["docPageSize"] = pages
  142. # 上传成功的文件,字段在函数内定义
  143. upload_ret = upload_oss(file_path, file_content, pdf_path, cover_path, persistent)
  144. if not upload_ret:
  145. return {}
  146. field.update(upload_ret)
  147. # 获得摘要
  148. # try:
  149. # summary = make_summary(file_content) # 获得摘要
  150. # except Exception as e:
  151. # logger.warning("摘要提取失败-->%s" % file_content)
  152. # summary = ""
  153. field["docSummary"] = file_content[:500]
  154. # 获得价钱
  155. return field
  156. def other_save(ossid: str):
  157. '''
  158. :param paths:
  159. :param filename:
  160. :return:
  161. '''
  162. try:
  163. global attachOss
  164. abs_dir = os.path.abspath(".")
  165. target_dir = os.path.join(abs_dir, "data/folder/")
  166. if os.path.exists(target_dir):
  167. del_directory(target_dir)
  168. generate_directory(target_dir)
  169. target_path = os.path.join(target_dir, ossid)
  170. state = attachOss.download_file(ossid, target_path)
  171. if not state:
  172. return False, target_path
  173. except Exception as e:
  174. print(e)
  175. return False, ""
  176. return True, target_path
  177. def get_classify(sub_class: str):
  178. '''
  179. 获取全部的分类
  180. :param sub_class:
  181. :return:
  182. '''
  183. total_classify = []
  184. classifies = sub_class.split(",")
  185. for classify in classifies:
  186. base_classify = [args.docClass]
  187. classify = classify.strip()
  188. if classify:
  189. for val in classify.split("_"):
  190. base_classify.append(val)
  191. if len(base_classify) > 1:
  192. total_classify.append("/".join(base_classify))
  193. return total_classify
@logger.catch
def walk_dir_start():
    """
    Ingest every document listed in ./data/data0_doc.csv: download the
    attachment from OSS, parse/convert/upload it, derive its category,
    tags and price, and save the assembled record to MySQL.

    NOTE(review): row layout inferred from usage below — confirm against
    the CSV producer: 0 user id, 1 fallback title, 3 category string,
    4 file name, 5 OSS attachment id, 6 upload timestamp, 7 alt title.
    :return: None
    """
    # NOTE: these locals shadow the module-level connections created at import.
    FileOss, MySql, previewOss, attachOss = link_db()
    classify_dict = load_classify(args.classify_path)
    mongo_info = pd.read_csv("./data/data0_doc.csv")
    for row in mongo_info.values.tolist():
        print("id--->", row[0])
        # Title: file name without its extension; if too short fall back to
        # row[7], then row[1]; skip the record when nothing usable remains.
        title = row[4]
        title = "".join(title.split(".")[:-1])
        if len(title) < 13:
            title = row[7]
            title = title.strip()
        if not title:
            title = row[1]
            title = title.strip()
        if not title: continue
        need_field = {}
        need_field["docName"] = title
        sub_class = row[3]
        ossid = row[5]
        if not sub_class:
            continue
        # Download the attachment to the local work directory.
        state, target_path = other_save(ossid)
        if not state:
            continue
        doctype, suffix, fileSize = get_property(target_path)
        if not check_file_type(doctype):  # supported-extension check
            continue
        need_field["docFileType"] = docType[doctype]  # numeric type code
        need_field["docFileSuffix"] = suffix  # file extension
        need_field["docFileSize"] = fileSize  # file size
        state, file_md5 = md5_exists(MySql, target_path)  # md5 dedupe check
        need_field["md5"] = file_md5
        if state:
            logger.warning("%s已经存在--------》" % title)
            continue
        # Parse, convert and upload; an empty dict means the doc was rejected.
        field = get_field(target_path, {})
        if not field:
            logger.warning("储存失败--->%s" % row[0])
            continue
        need_field.update(field)
        # Prefer the timestamp from the CSV; fall back to "now".
        if row[6]:
            need_field["uploadDate"] = datetime.datetime.fromtimestamp(row[6])
        else:
            need_field["uploadDate"] = datetime.datetime.now()
        need_field["isDelete"] = 0
        need_field["downOrUp"] = 1
        doctypes = get_classify(sub_class)
        # Keep only categories known to the classify table.
        docClass = ",".join([classify_dict[doctype] for doctype in doctypes if doctype in classify_dict])
        if not docClass:
            continue
        # Tags are the deduplicated path components of every category.
        need_field["docTags"] = ",".join(set([s for v in doctypes for s in v.split("/")]))
        need_field["docClass"] = docClass
        need_field["userId"] = row[0]
        need_field["appId"] = "10000"
        pages = need_field["docPageSize"]
        price = get_pricing(docClass, pages)
        need_field["price"] = price
        save_field(MySql, need_field)  # persist the record to MySQL
  258. if __name__ == '__main__':
  259. filepath = "./files"
  260. import time
  261. # ret = load_classify(args.classify_path)
  262. # print(ret)
  263. start = time.time()
  264. walk_dir_start()
  265. end = time.time()
  266. # print(end - start)