division.go 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991
  1. package pretreated
  2. import (
  3. "fmt"
  4. "jy/clear"
  5. "jy/util"
  6. qutil "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "unicode/utf8"
  12. )
  // Package-level patterns and lookup tables used for block splitting,
  // segment splitting and package (lot) detection.
  var (
  	/*regSerialTitles = []string{
  		"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
  		"[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
  		"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
  		"(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
  		"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
  		"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
  	}*/

  	// regSerialTitles_1 locates numbered headings inside a whole text; every
  	// pattern starts at the beginning of the text or right after a line break.
  	// Entry i is paired with regSerialTitles_2[i]: DivideBlock uses the same
  	// index into both slices.
  	regSerialTitles_1 = []*regexp.Regexp{
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)"),
  		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s(]*|^[\u3000\u2003\u00a0\\s(]*)(\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)"),
  	}
  	// regSerialTitles_2 parses a single, already isolated heading: capture
  	// group 1 is the serial number and group 2 is the title text.
  	regSerialTitles_2 = []*regexp.Regexp{
  		regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)$"),
  		regexp.MustCompile("^[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)$"),
  		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
  		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)$"),
  		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
  		regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
  		regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
  	}
  	// regReplAllTd matches complete <td ...>...</td> elements (case-insensitive,
  	// "." also matches line breaks).
  	regReplAllTd = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
  	// regIsNumber matches a string made only of ASCII digits.
  	regIsNumber = regexp.MustCompile("^\\d+$")
  	// regIsChineseNumber matches a string made only of the numerals 一..十.
  	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
  	// regReplAllSpace matches any run of whitespace, including U+3000, U+2003
  	// and U+00A0.
  	regReplAllSpace = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
  	// regTrimSpace matches leading or trailing whitespace.
  	regTrimSpace = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
  	// regReplWrapSpace matches a line break plus following whitespace at the
  	// very start or very end of a string.
  	regReplWrapSpace = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
  	// regReplAllSymbol matches one bracket or punctuation character.
  	regReplAllSymbol = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
  	// regFilterTitle matches a bracketed span together with its content.
  	regFilterTitle = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
  	// regDivision matches the colon that separates a key from its value.
  	regDivision = regexp.MustCompile("[::]")
  	// regSpliteSegment matches a single line-break character.
  	regSpliteSegment = regexp.MustCompile("[\r\n]")
  	// regFilterNumber matches a leading run of digits or Chinese numerals.
  	regFilterNumber = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
  	// regSplit matches the conjunctions and the enumeration comma that join
  	// several sub-titles inside one title.
  	regSplit = regexp.MustCompile("或|和|以?及|与|、|或")
  	// regStartWrap / regEndWrap match a line break at the start / end of a string.
  	regStartWrap = regexp.MustCompile("^[\r\n]")
  	regEndWrap   = regexp.MustCompile("[\r\n]$")
  	// regMoreWrap matches two or more consecutive line-break characters.
  	regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
  	// The following patterns are applied by divisionPackageChild before package
  	// detection; matches are replaced by a line break or removed there.
  	regStrWrap  = regexp.MustCompile("分包名称[::]")
  	regBZJWarap = regexp.MustCompile("(保证金.*|每包[0-9]*元|标志|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]标室)")
  	regFJWarap  = regexp.MustCompile("[a-zA-Z0-9](包|标段)[公告附件]*.(pdf|PDF|docx|doc|DOCX|DOC)")
  	regAZWarap  = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包)")
  	// replSerial matches a dotted/enumerated serial prefix (e.g. "1.2") at the
  	// start of the text or after "\r\n".
  	replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
  	// moreColonReg matches a run of one or more colons.
  	moreColonReg = regexp.MustCompile("[::]+")
  	// regFilter matches a trailing "等".
  	regFilter = regexp.MustCompile("等$")
  	// pkgFilter matches package/lot designations such as "第一标段" or "包号".
  	pkgFilter = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?(标|包)(段|号)?")
  	// indexTile matches a numbered sub-heading that ends with a colon.
  	indexTile = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+")
  	// indexTile2 is the fallback sub-heading pattern: 2-8 CJK or whitespace characters.
  	indexTile2 = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
  	// regReplAllSpace2 matches runs of whitespace, digits, dots, colons,
  	// enumeration commas and parentheses.
  	regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
  	// confusion maps words that contain a regSplit separator to a placeholder,
  	// so ProcTitle can mask them before splitting and restore them afterwards.
  	confusion = map[string]string{
  		"参与": "canyu",
  	}
  	// regPackageFilter pre-processes content before package detection: it
  	// matches a package heading line that is directly followed by "<table>",
  	// for example
  	//   第一包:采购设备清单
  	//   <table></table>
  	regPackageFilter = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
  	// filterPkgTitleKey matches a key that ends with "结果" and an optional colon.
  	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
  	// xuhao is the set of "<numeral>_<separator>" rune-code pairs at which
  	// DivideSegment starts a new segment. The numerals are 一(19968) 二(20108)
  	// 三(19977) 四(22235) 五(20116) 六(20845) 七(19971) 八(20843) 九(20061);
  	// the separators are "、"(12289) and "."(46).
  	xuhao = map[string]bool{
  		"19968_12289": true,
  		"19968_46":    true,
  		"20108_12289": true,
  		"20108_46":    true,
  		"19977_12289": true,
  		"19977_46":    true,
  		"22235_12289": true,
  		"22235_46":    true,
  		"20116_12289": true,
  		"20116_46":    true,
  		"20845_12289": true,
  		"20845_46":    true,
  		"19971_12289": true,
  		"19971_46":    true,
  		"20843_12289": true,
  		// "八." and "九、" were missing although every other numeral lists
  		// both separators.
  		"20843_46":    true,
  		"20061_12289": true,
  		"20061_46":    true,
  	}
  )
  96. //分块
  97. func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
  98. defer qutil.Catch()
  99. returnValue := 0
  100. var blocks []*util.Block
  101. if strings.TrimSpace(content) == "" {
  102. return blocks, -1
  103. }
  104. //table里面的内容不考虑,先把table清理掉
  105. //contentTemp := regReplAllTd.ReplaceAllString(content, "")
  106. contentTemp := TextAfterRemoveTable(content)
  107. tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
  108. var regContenSerialTitle *regexp.Regexp
  109. var regSerialTitleIndex int
  110. if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
  111. regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
  112. } else {
  113. regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
  114. }
  115. //没有分块
  116. if regSerialTitleIndex == -1 {
  117. if len(contentTemp) == len(content) {
  118. //没有分块
  119. return blocks, -1
  120. } else { //有table
  121. return blocks, -2
  122. }
  123. }
  124. //匹配序号和标题
  125. var regSerialTitle *regexp.Regexp
  126. if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
  127. regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
  128. } else {
  129. regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
  130. }
  131. indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
  132. indexs = filterSerial(content, indexs, tdIndexs)
  133. //头块
  134. var headBlock, endBlock *util.Block
  135. currentIndex := 0
  136. for k, v := range indexs {
  137. start, end := v[0], v[1]
  138. //添加开头部分
  139. if k == 0 {
  140. if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
  141. headBlock = &util.Block{
  142. Index: -1, //序号
  143. Text: headTemp, //内容
  144. Title: "", //标题
  145. Start: 0,
  146. End: start,
  147. }
  148. }
  149. }
  150. //分块
  151. blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
  152. serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle) //序号和标题
  153. if len(serialTitles) < 3 {
  154. continue
  155. }
  156. indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "") //序号
  157. index := 0
  158. //转成数字序号
  159. if regIsNumber.MatchString(indexSting) {
  160. index, _ = strconv.Atoi(indexSting)
  161. } else if regIsChineseNumber.MatchString(indexSting) {
  162. index = util.ChineseNumberToInt(indexSting)
  163. }
  164. //序号开始就是错误的
  165. if k+1 != index {
  166. if k == 0 {
  167. returnValue = 3
  168. break
  169. } else {
  170. if currentIndex+1 != index {
  171. //如果序号不是连续的,不往下走
  172. returnValue = 2
  173. //添加结尾部分
  174. if from != 3 {
  175. endBlock = &util.Block{
  176. Index: -2, //序号
  177. Text: content[start:], //内容
  178. Title: "", //标题
  179. Start: start,
  180. End: len(content),
  181. }
  182. break
  183. }
  184. }
  185. }
  186. currentIndex = index
  187. }
  188. //
  189. title := serialTitles[2] //标题
  190. title = regTrimSpace.ReplaceAllString(title, "") //清除前后空格
  191. //分块后的块文
  192. nextStart := len(content)
  193. if k < len(indexs)-1 {
  194. nextStart = indexs[k+1][0]
  195. }
  196. //获取块中除了序号和标题的内容
  197. blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
  198. if title != "" {
  199. blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
  200. //特殊情况处理
  201. if blockTextTemp == "" {
  202. if regDivision.MatchString(title) {
  203. /*
  204. 一、项目编号:HMEC170223
  205. 二、项目名称:执法记录仪采购
  206. */
  207. blockText = title
  208. divisionIndexs := regDivision.FindStringIndex(title)
  209. title = title[:divisionIndexs[0]]
  210. } else {
  211. /*
  212. 十一、投标代表须持本人身份证原件亲自递交投标文件,代理机构项目经理审核通过后,办理签收手续,否则投标文件被拒收。
  213. 十二、开标时间:2017年3月20日9时30分
  214. */
  215. blockText = title
  216. title = ""
  217. }
  218. } else if blockTextTemp != "" && regDivision.MatchString(title) {
  219. /*
  220. 2、采购单位名称:福建省汀州医院
  221. 采购单位地址: 龙岩市长汀县
  222. 联系人:胡科长
  223. 联系方式:0597-6826353
  224. */
  225. //多个标题
  226. divisionIndexs := regDivision.FindStringIndex(title)
  227. titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
  228. titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
  229. blockText = title + "\n" + blockText
  230. if titleAfter != "" {
  231. title = ""
  232. } else {
  233. title = titleBefore
  234. }
  235. } else {
  236. blockText = title + "\n" + blockText
  237. }
  238. }
  239. //没有内容的块,不打标签,不分段
  240. if blockText == "" {
  241. continue
  242. }
  243. //过滤
  244. if regexp.MustCompile("投标文件格式|业绩").MatchString(title) {
  245. continue
  246. }
  247. blockText = hasMergeKV(title, blockText)
  248. //
  249. titleIsExists := map[string]bool{} //去重
  250. title = filterTitle(title)
  251. //分割标题 [和及]。。。 参与
  252. splitTitles := ProcTitle(title)
  253. block := &util.Block{
  254. Index: index, //序号
  255. Text: blockText, //内容
  256. Title: title, //标题
  257. Titles: splitTitles,
  258. Start: start,
  259. End: nextStart,
  260. }
  261. titles := []string{}
  262. for _, sv := range splitTitles {
  263. if sv == "" || titleIsExists[sv] {
  264. continue
  265. }
  266. titleIsExists[sv] = true
  267. //标题过短过长不打标签
  268. if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
  269. //打标签
  270. block.Tags = append(block.Tags, util.GetBlockTags(sv))
  271. titles = append(titles, sv)
  272. }
  273. }
  274. block.Title = title
  275. block.Titles = titles
  276. if ruleBlock != nil {
  277. block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
  278. }
  279. tagsToBlocks(blocks, block)
  280. //log.Println(index, sv, splitTitles)
  281. //log.Println(blockText)
  282. blocks = append(blocks, block)
  283. }
  284. var returnBlocks []*util.Block
  285. if len(blocks) > 0 {
  286. //头
  287. if headBlock != nil {
  288. if tp == "招标" {
  289. headBlock.Classify = map[string]bool{"bidcondition": true}
  290. }
  291. returnBlocks = append(returnBlocks, headBlock)
  292. }
  293. //中间块
  294. returnBlocks = append(returnBlocks, blocks...)
  295. //尾
  296. if endBlock != nil {
  297. returnBlocks = append(returnBlocks, endBlock)
  298. }
  299. if returnValue == 0 {
  300. returnValue = 1
  301. }
  302. }
  303. contactFormat := &util.ContactFormat{
  304. IndexMap: map[int]string{},
  305. MatchMap: map[string]map[string]bool{},
  306. }
  307. for _, bl := range returnBlocks {
  308. //解析kv
  309. newText := TextAfterRemoveTable(bl.Text)
  310. bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
  311. bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
  312. //正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
  313. bl.Text = appendWarpStop(bl.Text)
  314. }
  315. return returnBlocks, returnValue
  316. }
  317. //块标题处理
  318. func ProcTitle(title string) []string {
  319. if title == "" {
  320. return []string{}
  321. }
  322. for k, v := range confusion {
  323. title = strings.Replace(title, k, v, -1)
  324. }
  325. direct := 1
  326. prev := ""
  327. ara := regSplit.Split(title, -1)
  328. for kk, vv := range ara {
  329. for kkk, vvv := range confusion {
  330. vv = strings.Replace(vv, vvv, kkk, -1)
  331. }
  332. ara[kk] = vv
  333. if len([]rune(vv)) == 2 {
  334. if kk == 0 {
  335. direct = -1
  336. } else {
  337. start := ""
  338. if len([]rune(prev)) > 3 {
  339. start = string([]rune(prev)[:len([]rune(prev))-2])
  340. }
  341. ara[kk] = start + vv
  342. }
  343. }
  344. if len([]rune(vv)) > 3 {
  345. if direct == -1 {
  346. end := string([]rune(vv)[len([]rune(vv))-2:])
  347. for i := 0; i < kk; i++ {
  348. ara[i] = ara[i] + end
  349. }
  350. break
  351. }
  352. prev = vv
  353. }
  354. }
  355. return ara
  356. }
  357. //有合并kv的 例如项目名称及编号
  358. func hasMergeKV(title, text string) string {
  359. title = regDivision.ReplaceAllString(title, "")
  360. titles := regSplit.Split(title, -1)
  361. if len(titles) <= 1 {
  362. return text
  363. }
  364. before := titles[0]
  365. after := titles[1]
  366. if strings.Contains(title, "项目") && len([]rune(after)) == 2 {
  367. after = "项目" + after
  368. } else {
  369. return text
  370. }
  371. if strings.Count(text, "\n") != 1 {
  372. return text
  373. }
  374. texts := strings.Split(text, "\n")
  375. textOneLine := texts[0]
  376. textTwoLine := texts[1]
  377. if regDivision.MatchString(textTwoLine) {
  378. return text
  379. }
  380. if textTwoLine := strings.SplitN(textTwoLine, ",", 2); len(textTwoLine) == 2 {
  381. text = textOneLine + "\n" + before + ":" + textTwoLine[0] + "," + after + ":" + textTwoLine[1]
  382. }
  383. return text
  384. }
  // filterSerial drops every heading span whose start offset lies strictly
  // inside one of the <td> spans in tdIndexs; such serial numbers belong to
  // table cells and are not block headings. The surviving spans are copied
  // into a new, non-nil slice. content is not used.
  func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
  	insideTd := func(pos int) bool {
  		for _, td := range tdIndexs {
  			if td[0] < pos && pos < td[1] {
  				return true
  			}
  		}
  		return false
  	}
  	kept := [][]int{}
  	for _, span := range indexs {
  		if !insideTd(span[0]) {
  			kept = append(kept, []int{span[0], span[1]})
  		}
  	}
  	return kept
  }
  404. //获取正文所用的序号类型
  405. func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
  406. var regContenSerialTitle *regexp.Regexp
  407. //先判断文章最外层使用的是哪种序号
  408. contentStartIndex, regSerialTitleIndex := -1, -1
  409. for k, v := range blockRegs {
  410. indexs := v.FindStringIndex(content)
  411. //只用最外层的序号,里面的过滤掉
  412. if len(indexs) == 2 && !regSpliteSegment.MatchString(strings.TrimSpace(content[indexs[0]:indexs[1]])) && (contentStartIndex == -1 || indexs[0] < contentStartIndex) {
  413. regSerialTitleIndex = k
  414. contentStartIndex = indexs[0]
  415. regContenSerialTitle = v
  416. }
  417. }
  418. return regContenSerialTitle, regSerialTitleIndex
  419. }
  420. //添加换行和句号
  421. func appendWarpStop(text string) string {
  422. //清理前后空格
  423. text = regTrimSpace.ReplaceAllString(text, "")
  424. //添加句号
  425. if !strings.HasSuffix(text, "。") {
  426. text += "。"
  427. }
  428. //添加换行
  429. if !regEndWrap.MatchString(text) {
  430. text += "\n"
  431. }
  432. return text
  433. }
  434. //分段
  435. func DivideSegmentHtml(txt string) []*util.Segment {
  436. //先分段
  437. _segs := strings.FieldsFunc(txt, func(r rune) bool {
  438. return r == 10 || r == 13
  439. })
  440. //再去除空行
  441. segs := make([]*util.Segment, 0)
  442. _index := 0
  443. for _, seg := range _segs {
  444. if seg != " " && len(seg) > 1 {
  445. _seg := util.Segment{}
  446. _index = _index + 1
  447. _seg.Index = _index
  448. _seg.Text = seg
  449. segs = append(segs, &_seg)
  450. }
  451. }
  452. return segs
  453. }
  454. //分段
  455. func DivideSegment(txt string) []*util.Segment {
  456. //先分段
  457. tmpstr := ""
  458. _segs := strings.FieldsFunc(txt, func(r rune) bool {
  459. if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
  460. r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
  461. if tmpstr == "" {
  462. tmpstr += fmt.Sprint(r)
  463. return false
  464. } else if strings.Contains(tmpstr, "_") {
  465. tmpstr = ""
  466. tmpstr += fmt.Sprint(r)
  467. return false
  468. } else if tmpstr == fmt.Sprint(r) {
  469. if r == 46 || r == 12289 {
  470. tmpstr = ""
  471. }
  472. return false
  473. }
  474. tmpstr += "_" + fmt.Sprint(r)
  475. if xuhao[tmpstr] {
  476. return true
  477. }
  478. }
  479. tmpstr = ""
  480. return r == 10 || r == 13
  481. })
  482. //再去除空行
  483. segs := make([]*util.Segment, 0)
  484. _index := 0
  485. for _, seg := range _segs {
  486. if seg != " " && len(seg) > 1 {
  487. _seg := util.Segment{}
  488. _index = _index + 1
  489. _seg.Index = _index
  490. _seg.Text = seg
  491. segs = append(segs, &_seg)
  492. }
  493. }
  494. return segs
  495. }
  496. /** 给块打标签 **/
  497. func tagsToBlocks(blocks []*util.Block, block *util.Block) {
  498. if len(block.Tags) == 0 {
  499. return
  500. }
  501. tag := map[string]bool{}
  502. tagWeight := map[string]int{}
  503. for _, v := range block.Tags {
  504. for _, ts := range v {
  505. tag[ts.Value] = true
  506. tagWeight[ts.Value] = ts.Weight
  507. }
  508. }
  509. for v, _ := range tag {
  510. for _, block := range blocks {
  511. if block.Tag[v] {
  512. for _, blockTags := range block.Tags {
  513. for _, ts := range blockTags {
  514. if ts.Value == v && ts.Weight < tagWeight[v] {
  515. block.Tag[v] = false
  516. }
  517. }
  518. }
  519. }
  520. }
  521. }
  522. block.Tag = tag
  523. }
  524. func filterTitle(title string) string {
  525. if strings.Contains(title, ",") && strings.Contains(title, "。") {
  526. return ""
  527. }
  528. if len([]rune(title)) > 30 {
  529. return ""
  530. }
  531. //清理空格
  532. title = regReplAllSpace.ReplaceAllString(title, "")
  533. //清理成对出现的符号中的内容
  534. title = regFilterTitle.ReplaceAllString(title, "")
  535. //清理特殊符号
  536. title = regReplAllSymbol.ReplaceAllString(title, "")
  537. //清理序号
  538. title = regFilterNumber.ReplaceAllString(title, "")
  539. title = regFilter.ReplaceAllString(title, "")
  540. return title
  541. }
  542. //从块里面找分包
  543. func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
  544. blockPackage = map[string]*util.BlockPackage{}
  545. //块分包
  546. for _, v := range *blocks {
  547. text := regPackageFilter.ReplaceAllString(v.Text, "<table>")
  548. text = TextAfterRemoveTable(text)
  549. if text == "" {
  550. continue
  551. }
  552. //var ok bool
  553. //var surplusText string
  554. divisionPackageChild(&blockPackage, text, v.Title, true, v.Tag["中标单位"], isSite, codeSite)
  555. }
  556. //orderwinner := winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
  557. for k, v := range blockPackage {
  558. findWinnerBugetBidmountByKv(v, blockPackage, k)
  559. }
  560. return
  561. }
  562. func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
  563. if v.ColonKV != nil && v.ColonKV.KvTags != nil {
  564. for kc, cv := range v.ColonKV.KvTags {
  565. if kc == "预算" && v.Budget <= 0 {
  566. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  567. if len(moneys) > 0 {
  568. if vf, ok := moneys[0].(float64); ok {
  569. blockPackage[k].Budget = vf
  570. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  571. } else if vi, ok := moneys[0].(int); ok {
  572. blockPackage[k].Budget = float64(vi)
  573. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  574. }
  575. }
  576. } else if kc == "中标金额" && v.Bidamount <= 0 {
  577. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  578. if len(moneys) > 0 {
  579. if vf, ok := moneys[0].(float64); ok {
  580. blockPackage[k].Bidamount = vf
  581. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  582. } else if vi, ok := moneys[0].(int); ok {
  583. blockPackage[k].Bidamount = float64(vi)
  584. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  585. }
  586. }
  587. } else if kc == "中标单位" && v.Winner == "" {
  588. blockPackage[k].Winner = cv[0].Value
  589. }
  590. }
  591. }
  592. if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
  593. for kc, cv := range v.SpaceKV.KvTags {
  594. if kc == "预算" && v.Budget <= 0 {
  595. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  596. if len(moneys) > 0 {
  597. if vf, ok := moneys[0].(float64); ok {
  598. blockPackage[k].Budget = vf
  599. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  600. } else if vi, ok := moneys[0].(int); ok {
  601. blockPackage[k].Budget = float64(vi)
  602. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  603. }
  604. }
  605. } else if kc == "中标金额" && v.Bidamount <= 0 {
  606. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  607. if len(moneys) > 0 {
  608. if vf, ok := moneys[0].(float64); ok {
  609. blockPackage[k].Bidamount = vf
  610. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  611. } else if vi, ok := moneys[0].(int); ok {
  612. blockPackage[k].Bidamount = float64(vi)
  613. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  614. }
  615. }
  616. } else if kc == "中标单位" && v.Winner == "" {
  617. blockPackage[k].Winner = cv[0].Value
  618. }
  619. }
  620. }
  621. }
  622. //从正文里面找分包
  623. func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
  624. blockPackage = map[string]*util.BlockPackage{}
  625. //从正文里面找分包
  626. divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
  627. for k, v := range blockPackage {
  628. findWinnerBugetBidmountByKv(v, blockPackage, k)
  629. }
  630. //winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
  631. return
  632. }
  633. //分块之后分包
  634. func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
  635. //查找知否有分包
  636. content = regFJWarap.ReplaceAllString(content, "\n")
  637. content = regAZWarap.ReplaceAllString(content, "\n")
  638. content = regStrWrap.ReplaceAllString(content, "\n")
  639. content = regMoreWrap.ReplaceAllString(content, "\n")
  640. content = regEndWrap.ReplaceAllString(content, "")
  641. content = regBZJWarap.ReplaceAllString(content, "")
  642. con, pkg, flag := CheckMultiPackage(content, title)
  643. if !flag {
  644. return false, ""
  645. }
  646. // util.Debug(con)
  647. // util.Debug(pkg)
  648. //分包前面添加换行
  649. appendWarpIndex := []int{}
  650. for _, v := range pkg {
  651. //如果文本内容以识别出来的分包标识结尾,不是分包
  652. if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
  653. return false, ""
  654. }
  655. is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
  656. for _, sv := range is {
  657. appendWarpIndex = append(appendWarpIndex, sv[0])
  658. }
  659. }
  660. appendWarpIndex = getPkgIndex(appendWarpIndex)
  661. conTemp := ""
  662. for k, v := range appendWarpIndex {
  663. if k == 0 {
  664. conTemp += con[:v] + "\n"
  665. } else {
  666. conTemp += "\n" + con[appendWarpIndex[k-1]:v]
  667. }
  668. if k == len(appendWarpIndex)-1 {
  669. conTemp += "\n" + con[v:]
  670. }
  671. }
  672. con = conTemp
  673. con = replSerial.ReplaceAllString(con, "\n")
  674. con = regMoreWrap.ReplaceAllString(con, "\n")
  675. //根据分包,找索引位置
  676. indexMap := map[int]int{}
  677. indexKeyStringMap := map[int]string{}
  678. indexKeyIntMap := map[int]int{}
  679. indexs := []int{}
  680. startEndMap := map[int]int{}
  681. pkgIndexMap := map[string][]int{}
  682. indexPkgMap := map[int]string{}
  683. //小标题
  684. titleindexs := indexTile.FindAllStringIndex(con, -1)
  685. if len(titleindexs) == 0 {
  686. titleindexs = indexTile2.FindAllStringIndex(con, -1)
  687. }
  688. //遍历分包,把kv在包前面的移动到包后面
  689. for _, v := range pkg {
  690. pgflag := v[0] + "[::]*"
  691. is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
  692. for _, sv := range is {
  693. indexMap[sv[0]] = sv[1]
  694. indexs = append(indexs, sv[0])
  695. pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
  696. indexPkgMap[sv[0]] = v[0]
  697. }
  698. //key在包前面,并且在一行的开头
  699. keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
  700. if len(keys) == 0 {
  701. //key在包前面,并且key以冒号结尾
  702. keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
  703. }
  704. if len(keys) == 0 {
  705. keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
  706. }
  707. for _, key := range keys {
  708. startEndMap[key[5]] = key[4]
  709. //
  710. headkey := con[key[4]:key[5]]
  711. headkey = regReplAllSpace.ReplaceAllString(headkey, "")
  712. if !regDivision.MatchString(headkey) {
  713. headkey += ":"
  714. }
  715. headkey = moreColonReg.ReplaceAllString(headkey, ":")
  716. colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
  717. if len(colonIndexs) > 1 {
  718. headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
  719. }
  720. indexKeyStringMap[key[5]] = headkey
  721. indexKeyIntMap[key[5]] = key[1]
  722. }
  723. }
  724. indexs = getPkgIndex(indexs)
  725. for ik, iv := range indexs {
  726. if indexKeyStringMap[iv] != "" {
  727. continue
  728. }
  729. if indexKeyIntMap[iv] == indexMap[iv] {
  730. continue
  731. }
  732. if ik > 0 {
  733. indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
  734. }
  735. }
  736. //获取截取标识
  737. surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
  738. //查找分包内容,分kv
  739. for _, iv := range indexs {
  740. text := indexTextMap[iv]
  741. tmptext := text
  742. //
  743. warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
  744. if len(indexWarpMap) > 0 {
  745. maxWarpCount = indexWarpMap[iv]
  746. }
  747. if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
  748. textTemp := text
  749. text = textTemp[:warpIndex[maxWarpCount-1][1]]
  750. surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
  751. }
  752. for bk, bv := range pkg {
  753. //判断分包如果在这段文字里面,该段文字就属于该包的
  754. if !strings.HasPrefix(text, bv[0]) {
  755. continue
  756. }
  757. index := util.PackageNumberConvert(bk)
  758. //去掉前缀,空格必须要加,分kv的时候要用
  759. text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
  760. if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
  761. var tagtitle string
  762. for i, v := range titleindexs {
  763. if i == 0 {
  764. continue
  765. }
  766. if v[0] > iv {
  767. tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
  768. break
  769. }
  770. }
  771. tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
  772. if tagtitle == "" {
  773. tagtitle = title
  774. } else if strings.Contains(tagtitle, bv[0]) && title != "" {
  775. tagtitle = title
  776. }
  777. text = tagtitle + ":" + text
  778. }
  779. headKey := ""
  780. if indexKeyStringMap[iv] != "" {
  781. //if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
  782. headKey = indexKeyStringMap[iv]
  783. text = indexKeyStringMap[iv] + " " + text
  784. //}
  785. for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
  786. delete(indexKeyStringMap, pkgIndexMap_v)
  787. break
  788. }
  789. }
  790. //如果一块中有多个相同的包,合并到一个
  791. if (*blockPackage)[index] != nil {
  792. //合并文本
  793. (*blockPackage)[index].Text += "\n" + text
  794. //合并冒号kv
  795. colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
  796. if headKey != "" {
  797. kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
  798. MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
  799. }
  800. MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
  801. //合并空格kv
  802. spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
  803. MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
  804. } else {
  805. newBpkg := &util.BlockPackage{
  806. Origin: bk,
  807. Text: text,
  808. Index: index,
  809. Name: bv[0],
  810. Type: bv[1],
  811. Accuracy: accuracy,
  812. }
  813. //fmt.Println(text)
  814. finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
  815. if headKey != "" {
  816. kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
  817. MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
  818. }
  819. newBpkg.ColonKV = finalKv
  820. newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
  821. (*blockPackage)[index] = newBpkg
  822. }
  823. }
  824. }
  825. //中标人排序
  826. //if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
  827. // for _, v := range *blockPackage {
  828. // v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2, isSite, codeSite)
  829. // }
  830. //}
  831. return true, surplusText
  832. }
  833. func getPkgIndex(indexs []int) []int {
  834. sort.Ints(indexs)
  835. indexsNew := []int{}
  836. count := 0
  837. for k, v := range indexs {
  838. if k > 0 && v-indexs[k-1] <= 10 {
  839. count++
  840. continue
  841. }
  842. indexsNew = append(indexsNew, v)
  843. }
  844. if count > 0 && count == len(indexs)-1 {
  845. return []int{}
  846. }
  847. return indexsNew
  848. }
  849. //每个包对应的结束位置,都是整行结束
  850. func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
  851. //util.Debug(con)
  852. surplusText := ""
  853. indexTextMap := map[int]string{}
  854. indexWarpMap := map[int]int{}
  855. maxWarpCount := 0
  856. for ik, iv := range indexs {
  857. text := ""
  858. if ik < len(indexs)-1 {
  859. if startEndMap[indexs[ik+1]] != 0 {
  860. text = con[iv:startEndMap[indexs[ik+1]]]
  861. } else {
  862. text = con[iv:indexs[ik+1]]
  863. }
  864. } else {
  865. text = con[iv:]
  866. }
  867. //fmt.Println(text)
  868. tmptext := text
  869. //if strings.Contains(text, "、") {
  870. // text = strings.Split(text, "、")[0]
  871. //} else
  872. if strings.Contains(text, "\n") {
  873. texts := strings.Split(text, "\n")
  874. text2 := ""
  875. if ik+1 < len(indexs)-1 {
  876. if startEndMap[indexs[ik+1+1]] != 0 {
  877. text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
  878. } else {
  879. text2 = con[indexs[ik+1]:indexs[ik+1+1]]
  880. }
  881. if texts[len(texts)-1] == text2 {
  882. text = texts[0]
  883. }
  884. }
  885. }
  886. if utf8.RuneCountInString(text) < 5 {
  887. indexTextMap[iv] = tmptext
  888. } else {
  889. indexTextMap[iv] = text
  890. }
  891. warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
  892. if warpCount > maxWarpCount {
  893. maxWarpCount = warpCount
  894. }
  895. indexWarpMap[iv] = warpCount
  896. if ik == 0 {
  897. surplusText += con[:iv]
  898. }
  899. }
  900. pkgLaw := ""
  901. if len(pkgIndexMap) > 1 {
  902. //有规律的出现 AB or ABAB
  903. if pkgLaw == "" {
  904. prevVal := ""
  905. notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
  906. indexMaxMap := map[int]int{}
  907. for ik, iv := range indexs {
  908. if notRepeatCount == len(pkgIndexMap) {
  909. notRepeatCount = 0
  910. }
  911. if prevVal != indexPkgMap[iv] {
  912. notRepeatCount++
  913. } else {
  914. notRepeatCount = -1
  915. currentIndex = ik
  916. break
  917. }
  918. prevVal = indexPkgMap[iv]
  919. if notRepeatCount == len(pkgIndexMap) {
  920. indexMaxMap[iv] = onceMax
  921. onceMax = 0
  922. }
  923. if indexWarpMap[iv] > onceMax {
  924. onceMax = indexWarpMap[iv]
  925. allMax = onceMax
  926. }
  927. if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
  928. notRepeatCount = -2
  929. currentIndex = ik
  930. }
  931. }
  932. //util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
  933. if len(indexMaxMap) > 0 {
  934. pkgLaw = "AB"
  935. thisMax := 0
  936. for ik := len(indexs) - 1; ik >= 0; ik-- {
  937. iv := indexs[ik]
  938. if currentIndex != -1 && ik >= currentIndex {
  939. indexWarpMap[iv] = allMax
  940. continue
  941. }
  942. if indexMaxMap[iv] > 0 {
  943. thisMax = indexMaxMap[iv]
  944. }
  945. indexWarpMap[iv] = thisMax
  946. }
  947. }
  948. }
  949. }
  950. if pkgLaw == "" {
  951. indexWarpMap = map[int]int{}
  952. }
  953. //util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
  954. return surplusText, maxWarpCount, indexTextMap, indexWarpMap
  955. }
  956. //分块之后的kv
  957. func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
  958. blocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
  959. kvs := []*util.Kv{}
  960. for _, v := range blocks {
  961. //util.Debug(v.Text)
  962. // for _, vvv := range v.ColonKV.Kvs {
  963. // util.Debug(vvv.Key, vvv.Value, vvv.Title)
  964. // }
  965. kvs = append(kvs, v.ColonKV.Kvs...)
  966. }
  967. return kvs
  968. }