1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- #!/usr/bin/python3.6
- # -*- coding: utf-8 -*-
- # @Author : lijunliang
- # @Email : lijunliang@topnet.net.cn
- # @File : abstract.py
- # @Software: PyCharm
- """
- 获取文本摘要
- """
- from gensim.summarization.summarizer import summarize
- import jieba.posseg as psg
- import re
def split(src: str) -> (int, str):
    """
    Tokenize text into the space-separated form handed to gensim's summarizer.

    Newlines are stripped from the input, the remainder is segmented with
    jieba's part-of-speech tokenizer, and every token flagged ``x`` that ends
    with the full-width full stop "。" gets an ASCII "." appended
    (presumably so gensim can recognise the sentence boundary -- confirm).

    :param src: raw text to tokenize
    :return: tuple of (number of tokens, tokens joined by single spaces)
    """
    flattened = re.sub("\n+", "", src)
    tokens = []
    for word, flag in psg.cut(flattened):
        if flag == 'x' and len(word) > 0 and str(word).endswith('。'):
            tokens.append(word + ".")
        else:
            tokens.append(word)
    return len(tokens), ' '.join(tokens)
def make_summary(src: str, ws_max: int = 200, ws_min: int = 50, doc_radio: float = .1) -> str:
    """
    Build an extractive summary of ``src``.

    The text is tokenized by ``split`` and passed to gensim's ``summarize``
    with a word budget derived from the token count.

    :param src: raw text to summarize
    :param ws_max: upper bound on the word budget
    :param ws_min: lower bound on the word budget; it wins if larger than ``ws_max``
    :param doc_radio: fraction of the token count used as the budget before clamping
    :return: the summary with the tokenization spaces and the "." sentence
        markers added by ``split`` removed
    """
    token_count, tokenized = split(src)
    # Budget: doc_radio of the document, capped at ws_max, but never below ws_min.
    budget = max(min(int(token_count * doc_radio), ws_max), ws_min)
    summary = str(summarize(tokenized, word_count=budget))
    # Undo the tokenization: drop the joining spaces, then remove only the "."
    # that split() appended after "。". Stripping every "." would also corrupt
    # periods that belong to the source text, such as decimals ("3.5" -> "35").
    return summary.replace(' ', '').replace('。.', '。')
if __name__ == '__main__':
    # Manual smoke run: summarize a sample paragraph and print the result.
    # ws_min=1 lowers the minimum word budget so the short sample is not
    # forced up to the default floor of 50 words.
    sample = "本项目位于广州兴丰生活垃圾卫生填埋场内,主要工作内容为电力接入系统建设项目的采购、施工(含竣工试验)、试运行等实行全过程工程承包。包括变压器、高低压开关柜、电力电缆、密集母线、配电箱、电气二次设备、安全监控、防雷接地(沼气发电厂范围)等设备的采购、安装、调试、远动联调、试运行、竣工验收等。另需在现场一期渗沥液处理厂扩容工程10kV高压开关室新增一个高压开关间隔及母线连接,并需完成相关电缆敷设等。 "
    print(make_summary(sample, ws_min=1))
|