division.go 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146
  1. package pretreated
  2. import (
  3. "fmt"
  4. "jy/clear"
  5. "jy/util"
  6. qutil "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "unicode/utf8"
  12. )
  13. //分块、分段功能
  14. var (
  15. /*regSerialTitles = []string{
  16. "([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
  17. "[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
  18. "(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
  19. "(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
  20. "(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
  21. "1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
  22. }*/
  23. regSerialTitles_1 = []*regexp.Regexp{
  24. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
  25. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
  26. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
  27. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)"),
  28. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
  29. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)"),
  30. regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s(]*|^[\u3000\u2003\u00a0\\s(]*)(\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)"),
  31. }
  32. regSerialTitles_2 = []*regexp.Regexp{
  33. regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)$"),
  34. regexp.MustCompile("^[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)$"),
  35. regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
  36. regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)$"),
  37. regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
  38. regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
  39. regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
  40. }
  41. regReplAllTd = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
  42. regIsNumber = regexp.MustCompile("^\\d+$")
  43. regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
  44. regReplAllSpace = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
  45. regTrimSpace = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
  46. regReplWrapSpace = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
  47. regReplAllSymbol = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
  48. regFilterTitle = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
  49. regDivision = regexp.MustCompile("[::]")
  50. regSpliteSegment = regexp.MustCompile("[\r\n]")
  51. regFilterNumber = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
  52. regSplit = regexp.MustCompile("或|和|以?及|与|、|或")
  53. regStartWrap = regexp.MustCompile("^[\r\n]")
  54. regEndWrap = regexp.MustCompile("[\r\n]$")
  55. regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
  56. regStrWrap = regexp.MustCompile("分包名称[::]")
  57. regBZJWarap = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
  58. regFJWarap = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
  59. regAZWarap = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
  60. replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
  61. moreColonReg = regexp.MustCompile("[::]+")
  62. regFilter = regexp.MustCompile("等$")
  63. pkgFilter = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
  64. indexTile = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\\u4e00-\\u9fa5]{2,8}[::]+`) //小标题
  65. indexTile2 = regexp.MustCompile(`[\s\\u4e00-\\u9fa5]{2,8}[::]\n`)
  66. regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
  67. confusion = map[string]string{
  68. "参与": "canyu",
  69. }
  70. //查找分包之前,先对内容进行预处理
  71. /*
  72. 第一包:采购设备清单
  73. <table></table>
  74. */
  75. regPackageFilter = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
  76. filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
  77. xuhao = map[string]bool{
  78. "19968_12289": true,
  79. "19968_46": true,
  80. "20108_12289": true,
  81. "20108_46": true,
  82. "19977_12289": true,
  83. "19977_46": true,
  84. "22235_12289": true,
  85. "22235_46": true,
  86. "20116_12289": true,
  87. "20116_46": true,
  88. "20845_12289": true,
  89. "20845_46": true,
  90. "19971_12289": true,
  91. "19971_46": true,
  92. "20843_12289": true,
  93. "20061_46": true,
  94. }
  95. //非分包中标单位值
  96. unPackageWinnerReg = regexp.MustCompile("(重新招标|方案包)")
  97. conformWinnerKVReg = regexp.MustCompile("^(中标人|中标银行|第一名)[::](.{4,20}(分行|公司))")
  98. conformWinnerKVReg1 = regexp.MustCompile("^[-].{4,15}公司$")
  99. conformWinnerKVReg2 = regexp.MustCompile("(.*)?确定(.*公司)为中标人(.*)?")
  100. conformWinnerTextReg3 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")
  101. /*
  102. 拟定供应商信息:
  103. 名称:郑州人民广播电台
  104. 地址:郑州市金水区内环路17号A座。
  105. */
  106. //针对处理-替换敏感词-中标
  107. packageReg1 = regexp.MustCompile("(包件[一二三四五1-9][::].*)\n1[、.\\s]+名称[::](.*)\n2[、.\\s]+")
  108. packageReg2 = regexp.MustCompile("标段[((]包[))][\\[][O0]+([1-9一二三四五六七八九])[\\]]")
  109. packageReg3 = regexp.MustCompile("(中标价格)[::]")
  110. packageReg4 = regexp.MustCompile("([1-9](标段)[::])拟定供应商名称[::](.*公司)\n")
  111. packageReg5 = regexp.MustCompile("(第[1-9一二三四五](标段))(中标人)[::](.*)\n")
  112. packageReg6 = regexp.MustCompile("供应商名称[::](.{4,20}公司)[((]([0]?1包)[))][、,,](.{4,20}公司)[((]([0]?2包)[))]")
  113. //预算
  114. packageReg20 = regexp.MustCompile("(最高投标限价为|投资预算约[为]?)([0-9.万元人民币]+)")
  115. packageReg21 = regexp.MustCompile("(预算金额|项目预算)[::](包[\\s]?1|1[\\s]?包)[::]?([0-9.万元人民币]+)[,,](包[\\s]?2|2[\\s]?包)[::]?([0-9.万元人民币]+)")
  116. untitleReg = regexp.MustCompile("(技术评分明细表)")
  117. unpriceReg = regexp.MustCompile("(^([Xx]\\+[1-9\\.]+元/每)|分析)")
  118. //敏感词-影响分包-替换-分割
  119. replaceSenstiveReg1 = regexp.MustCompile("([一二三四五六七八九十1-9][、]项目名称[::].*采购项目)([一二三四五六七八九十1-9][、]采购结果)")
  120. //价格~单位换行 替换
  121. packageReg50 = regexp.MustCompile("(投标报价[::][0-9.]+)\n(万元)")
  122. )
  123. //分块
  124. func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
  125. defer qutil.Catch()
  126. returnValue := 0
  127. var blocks []*util.Block
  128. if strings.TrimSpace(content) == "" || codeSite == "a_zgyc_ztbxx" || codeSite=="a_gyzbgfyxgs_zbjg" {
  129. return blocks, -1
  130. }
  131. //table里面的内容不考虑,先把table清理掉
  132. //contentTemp := regReplAllTd.ReplaceAllString(content, "")
  133. contentTemp := TextAfterRemoveTable(content)
  134. tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
  135. var regContenSerialTitle *regexp.Regexp
  136. var regSerialTitleIndex int
  137. if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
  138. regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
  139. } else {
  140. regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
  141. }
  142. //没有分块
  143. if regSerialTitleIndex == -1 {
  144. if len(contentTemp) == len(content) {
  145. //没有分块
  146. return blocks, -1
  147. } else { //有table
  148. return blocks, -2
  149. }
  150. }
  151. //匹配序号和标题
  152. var regSerialTitle *regexp.Regexp
  153. if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
  154. regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
  155. } else {
  156. regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
  157. }
  158. indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
  159. indexs = filterSerial(content, indexs, tdIndexs)
  160. //头块
  161. var headBlock, endBlock *util.Block
  162. currentIndex := 0
  163. for k, v := range indexs {
  164. start, end := v[0], v[1]
  165. //添加开头部分
  166. if k == 0 {
  167. if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
  168. headBlock = &util.Block{
  169. Index: -1, //序号
  170. Text: headTemp, //内容
  171. Title: "", //标题
  172. Start: 0,
  173. End: start,
  174. }
  175. }
  176. }
  177. //分块
  178. blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
  179. serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle) //序号和标题
  180. if len(serialTitles) < 3 {
  181. continue
  182. }
  183. indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "") //序号
  184. index := 0
  185. //转成数字序号
  186. if regIsNumber.MatchString(indexSting) {
  187. index, _ = strconv.Atoi(indexSting)
  188. } else if regIsChineseNumber.MatchString(indexSting) {
  189. index = util.ChineseNumberToInt(indexSting)
  190. }
  191. //序号开始就是错误的
  192. if k+1 != index {
  193. if k == 0 {
  194. returnValue = 3
  195. break
  196. } else {
  197. if currentIndex+1 != index {
  198. //如果序号不是连续的,不往下走
  199. returnValue = 2
  200. //添加结尾部分
  201. if from != 3 {
  202. endBlock = &util.Block{
  203. Index: -2, //序号
  204. Text: content[start:], //内容
  205. Title: "", //标题
  206. Start: start,
  207. End: len(content),
  208. }
  209. break
  210. }
  211. }
  212. }
  213. currentIndex = index
  214. }
  215. //
  216. title := serialTitles[2] //标题
  217. title = regTrimSpace.ReplaceAllString(title, "") //清除前后空格
  218. //分块后的块文
  219. nextStart := len(content)
  220. if k < len(indexs)-1 {
  221. nextStart = indexs[k+1][0]
  222. }
  223. //获取块中除了序号和标题的内容
  224. blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
  225. if title != "" {
  226. blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
  227. //特殊情况处理
  228. if blockTextTemp == "" {
  229. if regDivision.MatchString(title) {
  230. /*
  231. 一、项目编号:HMEC170223
  232. 二、项目名称:执法记录仪采购
  233. */
  234. blockText = title
  235. divisionIndexs := regDivision.FindStringIndex(title)
  236. title = title[:divisionIndexs[0]]
  237. } else {
  238. /*
  239. 十一、投标代表须持本人身份证原件亲自递交投标文件,代理机构项目经理审核通过后,办理签收手续,否则投标文件被拒收。
  240. 十二、开标时间:2017年3月20日9时30分
  241. */
  242. blockText = title
  243. title = ""
  244. }
  245. } else if blockTextTemp != "" && regDivision.MatchString(title) {
  246. /*
  247. 2、采购单位名称:福建省汀州医院
  248. 采购单位地址: 龙岩市长汀县
  249. 联系人:胡科长
  250. 联系方式:0597-6826353
  251. */
  252. //多个标题
  253. divisionIndexs := regDivision.FindStringIndex(title)
  254. titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
  255. titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
  256. blockText = title + "\n" + blockText
  257. if titleAfter != "" {
  258. title = ""
  259. } else {
  260. title = titleBefore
  261. }
  262. } else {
  263. blockText = title + "\n" + blockText
  264. }
  265. }
  266. //没有内容的块,不打标签,不分段
  267. if blockText == "" {
  268. continue
  269. }
  270. //过滤
  271. if regexp.MustCompile("投标文件格式|业绩").MatchString(title) &&
  272. !regexp.MustCompile("拟定的唯一供应商名称").MatchString(title){
  273. continue
  274. }
  275. blockText = hasMergeKV(title, blockText)
  276. //
  277. titleIsExists := map[string]bool{} //去重
  278. title = filterTitle(title)
  279. //分割标题 [和及]。。。 参与
  280. splitTitles := ProcTitle(title)
  281. blockText = mergetext(splitTitles, blockText)
  282. block := &util.Block{
  283. Index: index, //序号
  284. Text: blockText, //内容
  285. Title: title, //标题
  286. Titles: splitTitles,
  287. Start: start,
  288. End: nextStart,
  289. }
  290. titles := []string{}
  291. for _, sv := range splitTitles {
  292. if sv == "" || titleIsExists[sv] {
  293. continue
  294. }
  295. titleIsExists[sv] = true
  296. //标题过短过长不打标签
  297. if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
  298. //打标签
  299. block.Tags = append(block.Tags, util.GetBlockTags(sv))
  300. titles = append(titles, sv)
  301. }
  302. }
  303. block.Title = title
  304. block.Titles = titles
  305. if ruleBlock != nil {
  306. block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
  307. }
  308. tagsToBlocks(blocks, block)
  309. //log.Println(index, sv, splitTitles)
  310. //log.Println(blockText)
  311. blocks = append(blocks, block)
  312. }
  313. var returnBlocks []*util.Block
  314. if len(blocks) > 0 {
  315. //头
  316. if headBlock != nil {
  317. if tp == "招标" {
  318. headBlock.Classify = map[string]bool{"bidcondition": true}
  319. }
  320. returnBlocks = append(returnBlocks, headBlock)
  321. }
  322. //中间块
  323. returnBlocks = append(returnBlocks, blocks...)
  324. //尾
  325. if endBlock != nil {
  326. returnBlocks = append(returnBlocks, endBlock)
  327. }
  328. if returnValue == 0 {
  329. returnValue = 1
  330. }
  331. }
  332. contactFormat := &util.ContactFormat{
  333. IndexMap: map[int]string{},
  334. MatchMap: map[string]map[string]bool{},
  335. }
  336. for _, bl := range returnBlocks {
  337. //解析kv
  338. newText := TextAfterRemoveTable(bl.Text) //取出纯文本
  339. bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
  340. bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
  341. //正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
  342. bl.Text = appendWarpStop(bl.Text)
  343. }
  344. return returnBlocks, returnValue
  345. }
  346. func mergetext(titles []string, text string) string {
  347. if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
  348. return text
  349. }
  350. splitLenstrs := strings.Split(text, "\n")
  351. if len(splitLenstrs) == 1 || len(titles) != len(splitLenstrs)-1 {
  352. return text
  353. }
  354. tt := ""
  355. for i, v := range splitLenstrs[1:] {
  356. lentexts := regDivision.Split(v, -1)
  357. if len(lentexts) == 2 {
  358. if strings.Contains(titles[i], lentexts[0]) {
  359. tt += titles[i] + ":" + lentexts[1] + "\n"
  360. }else if strings.Contains(strings.ReplaceAll(titles[i],"的",""), strings.ReplaceAll(lentexts[0],"的","")){
  361. tt += titles[i] + ":" + lentexts[1] + "\n"
  362. }else if strings.Contains(strings.ReplaceAll(titles[i],"联系地址","地址"), strings.ReplaceAll(lentexts[0],"联系地址","地址")){
  363. tt += titles[i] + ":" + lentexts[1] + "\n"
  364. }
  365. }else {
  366. //特殊处理
  367. if strings.Contains(v,"中标人 ") {
  368. tt +=v+"\n"
  369. }
  370. }
  371. }
  372. if len(tt) == 0 {
  373. return text
  374. } else {
  375. return tt
  376. }
  377. }
  378. //块标题处理
  379. func ProcTitle(title string) []string {
  380. if title == "" {
  381. return []string{}
  382. }
  383. for k, v := range confusion {
  384. title = strings.Replace(title, k, v, -1)
  385. }
  386. direct := 1
  387. prev := ""
  388. ara := regSplit.Split(title, -1)
  389. for kk, vv := range ara {
  390. for kkk, vvv := range confusion {
  391. vv = strings.Replace(vv, vvv, kkk, -1)
  392. }
  393. ara[kk] = vv
  394. if len([]rune(vv)) == 2 {
  395. if kk == 0 {
  396. direct = -1
  397. } else {
  398. start := ""
  399. if len([]rune(prev)) > 3 {
  400. start = string([]rune(prev)[:len([]rune(prev))-2])
  401. }
  402. ara[kk] = start + vv
  403. }
  404. } else if vv == "联系人" || vv == "联系方式" {
  405. if strings.Contains(prev, "代理") {
  406. ara[kk] = "代理机构" + vv
  407. } else if strings.Contains(prev, "中标") {
  408. ara[kk] = "中标单位" + vv
  409. } else if strings.Contains(prev, "采购") {
  410. ara[kk] = "采购单位" + vv
  411. }
  412. }
  413. if len([]rune(vv)) > 3 {
  414. if direct == -1 {
  415. end := string([]rune(vv)[len([]rune(vv))-2:])
  416. for i := 0; i < kk; i++ {
  417. ara[i] = ara[i] + end
  418. }
  419. break
  420. }
  421. prev = vv
  422. }
  423. }
  424. return ara
  425. }
  426. //有合并kv的 例如项目名称及编号
  427. func hasMergeKV(title, text string) string {
  428. title = regDivision.ReplaceAllString(title, "")
  429. titles := regSplit.Split(title, -1)
  430. if len(titles) <= 1 {
  431. return text
  432. }
  433. before := titles[0]
  434. after := titles[1]
  435. if strings.Contains(title, "项目") && len([]rune(after)) == 2 {
  436. after = "项目" + after
  437. } else {
  438. return text
  439. }
  440. if strings.Count(text, "\n") != 1 {
  441. return text
  442. }
  443. texts := strings.Split(text, "\n")
  444. textOneLine := texts[0]
  445. textTwoLine := texts[1]
  446. if regDivision.MatchString(textTwoLine) {
  447. return text
  448. }
  449. if textTwoLine := strings.SplitN(textTwoLine, ",", 2); len(textTwoLine) == 2 {
  450. text = textOneLine + "\n" + before + ":" + textTwoLine[0] + "," + after + ":" + textTwoLine[1]
  451. }
  452. return text
  453. }
  454. //过滤序号,判断序号是不是在td里,如果是的话这个序号作废
  455. func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
  456. returnIndexs := [][]int{}
  457. for _, v := range indexs {
  458. flag := false
  459. //根据序号的开始位置,判断是不是在td里面
  460. for _, tv := range tdIndexs {
  461. if v[0] > tv[0] && v[0] < tv[1] {
  462. flag = true
  463. continue
  464. }
  465. }
  466. if flag {
  467. continue
  468. }
  469. returnIndexs = append(returnIndexs, []int{v[0], v[1]})
  470. }
  471. return returnIndexs
  472. }
  473. //获取正文所用的序号类型
  474. func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
  475. var regContenSerialTitle *regexp.Regexp
  476. //先判断文章最外层使用的是哪种序号
  477. contentStartIndex, regSerialTitleIndex := -1, -1
  478. for k, v := range blockRegs {
  479. indexs := v.FindStringIndex(content)
  480. //只用最外层的序号,里面的过滤掉
  481. if len(indexs) == 2 && !strings.Contains(content,"中标候选人排序") && !regSpliteSegment.MatchString(strings.TrimSpace(content[indexs[0]:indexs[1]])) && (contentStartIndex == -1 || indexs[0] < contentStartIndex) {
  482. regSerialTitleIndex = k
  483. contentStartIndex = indexs[0]
  484. regContenSerialTitle = v
  485. }
  486. }
  487. return regContenSerialTitle, regSerialTitleIndex
  488. }
  489. //添加换行和句号
  490. func appendWarpStop(text string) string {
  491. //清理前后空格
  492. text = regTrimSpace.ReplaceAllString(text, "")
  493. //添加句号
  494. if !strings.HasSuffix(text, "。") {
  495. text += "。"
  496. }
  497. //添加换行
  498. if !regEndWrap.MatchString(text) {
  499. text += "\n"
  500. }
  501. return text
  502. }
  503. //分段
  504. func DivideSegmentHtml(txt string) []*util.Segment {
  505. //先分段
  506. _segs := strings.FieldsFunc(txt, func(r rune) bool {
  507. return r == 10 || r == 13
  508. })
  509. //再去除空行
  510. segs := make([]*util.Segment, 0)
  511. _index := 0
  512. for _, seg := range _segs {
  513. if seg != " " && len(seg) > 1 {
  514. _seg := util.Segment{}
  515. _index = _index + 1
  516. _seg.Index = _index
  517. _seg.Text = seg
  518. segs = append(segs, &_seg)
  519. }
  520. }
  521. return segs
  522. }
  523. //分段
  524. func DivideSegment(txt string) []*util.Segment {
  525. //先分段
  526. tmpstr := ""
  527. _segs := strings.FieldsFunc(txt, func(r rune) bool {
  528. if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
  529. r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
  530. if tmpstr == "" {
  531. tmpstr += fmt.Sprint(r)
  532. return false
  533. } else if strings.Contains(tmpstr, "_") {
  534. tmpstr = ""
  535. tmpstr += fmt.Sprint(r)
  536. return false
  537. } else if tmpstr == fmt.Sprint(r) {
  538. if r == 46 || r == 12289 {
  539. tmpstr = ""
  540. }
  541. return false
  542. }
  543. tmpstr += "_" + fmt.Sprint(r)
  544. if xuhao[tmpstr] {
  545. return true
  546. }
  547. }
  548. tmpstr = ""
  549. return r == 10 || r == 13
  550. })
  551. //再去除空行
  552. segs := make([]*util.Segment, 0)
  553. _index := 0
  554. for _, seg := range _segs {
  555. if seg != " " && len(seg) > 1 {
  556. _seg := util.Segment{}
  557. _index = _index + 1
  558. _seg.Index = _index
  559. _seg.Text = seg
  560. segs = append(segs, &_seg)
  561. }
  562. }
  563. return segs
  564. }
  565. /** 给块打标签 **/
  566. func tagsToBlocks(blocks []*util.Block, block *util.Block) {
  567. if len(block.Tags) == 0 {
  568. return
  569. }
  570. tag := map[string]bool{}
  571. tagWeight := map[string]int{}
  572. for _, v := range block.Tags {
  573. for _, ts := range v {
  574. tag[ts.Value] = true
  575. tagWeight[ts.Value] = ts.Weight
  576. }
  577. }
  578. for v, _ := range tag {
  579. for _, block := range blocks {
  580. if block.Tag[v] {
  581. for _, blockTags := range block.Tags {
  582. for _, ts := range blockTags {
  583. if ts.Value == v && ts.Weight < tagWeight[v] {
  584. block.Tag[v] = false
  585. }
  586. }
  587. }
  588. }
  589. }
  590. }
  591. block.Tag = tag
  592. }
  593. func filterTitle(title string) string {
  594. if strings.Contains(title, ",") && strings.Contains(title, "。") {
  595. return ""
  596. }
  597. if len([]rune(title)) > 30 {
  598. return ""
  599. }
  600. //清理空格
  601. title = regReplAllSpace.ReplaceAllString(title, "")
  602. //清理成对出现的符号中的内容
  603. title = regFilterTitle.ReplaceAllString(title, "")
  604. //清理特殊符号
  605. title = regReplAllSymbol.ReplaceAllString(title, "")
  606. //清理序号
  607. title = regFilterNumber.ReplaceAllString(title, "")
  608. title = regFilter.ReplaceAllString(title, "")
  609. return title
  610. }
  611. //从块里面找分包
  612. func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
  613. blockPackage = map[string]*util.BlockPackage{}
  614. //块分包
  615. for _, v := range *blocks {
  616. text := regPackageFilter.ReplaceAllString(v.Text, "<table>")
  617. text = TextAfterRemoveTable(text)
  618. if text == "" {
  619. continue
  620. }
  621. //var ok bool
  622. //var surplusText string
  623. //分析分包-金额,中标单位,人电话,包名,中标后选人
  624. divisionPackageChild(&blockPackage, text, v.Title, true, v.Tag["中标单位"], isSite, codeSite)
  625. }
  626. //orderwinner := winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
  627. for k, v := range blockPackage {
  628. findWinnerBugetBidmountByKv(v, blockPackage, k) //根据kv-find字段
  629. }
  630. return
  631. }
  632. func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
  633. if v.ColonKV != nil && v.ColonKV.KvTags != nil {
  634. for kc, cv := range v.ColonKV.KvTags {
  635. if kc == "预算" && v.Budget <= 0 {
  636. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  637. if len(moneys) > 0 {
  638. if vf, ok := moneys[0].(float64); ok {
  639. blockPackage[k].Budget = vf
  640. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  641. } else if vi, ok := moneys[0].(int); ok {
  642. blockPackage[k].Budget = float64(vi)
  643. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  644. }
  645. }
  646. } else if (kc == "中标金额"||kc=="各包中标/成交候选供应商及报价") && v.Bidamount <= 0 {
  647. //特殊金额类可避免
  648. if unpriceReg.MatchString(cv[0].Value) {
  649. continue
  650. }
  651. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  652. if len(moneys) > 0 {
  653. if vf, ok := moneys[0].(float64); ok {
  654. blockPackage[k].Bidamount = vf
  655. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  656. } else if vi, ok := moneys[0].(int); ok {
  657. blockPackage[k].Bidamount = float64(vi)
  658. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  659. }
  660. }
  661. } else if (kc == "中标单位"||kc=="第1 名"||kc=="各包中标/成交候选供应商及报价") && v.Winner == "" {
  662. if !unPackageWinnerReg.MatchString(cv[0].Value) {
  663. isW:=false
  664. if len(cv)>1 {
  665. for _,v_cv :=range cv{
  666. if v_cv.Key=="中标单位" && v_cv.Value!="" {
  667. isW = true
  668. blockPackage[k].Winner = v_cv.Value
  669. break
  670. }
  671. }
  672. }
  673. if !isW {
  674. blockPackage[k].Winner = cv[0].Value
  675. }
  676. }
  677. }else { //特殊情况-特殊处理
  678. res := conformWinnerKVReg.FindAllStringSubmatch(cv[0].Value, -1)
  679. if len(res) > 0 {
  680. text := res[0][2]
  681. if text!="" {
  682. blockPackage[k].Winner = text
  683. continue
  684. }
  685. }
  686. if kc=="中标信息" && conformWinnerKVReg1.MatchString(cv[0].Value){
  687. blockPackage[k].Winner = cv[0].Value
  688. continue
  689. }
  690. if conformWinnerKVReg2.MatchString(cv[0].Value) {
  691. blockPackage[k].Winner = conformWinnerKVReg2.ReplaceAllString(cv[0].Value,"${2}")
  692. continue
  693. }
  694. //全文找
  695. res = conformWinnerTextReg3.FindAllStringSubmatch(v.Text, -1)
  696. if len(res) > 0 {
  697. text := res[0][1]
  698. if text!="" {
  699. blockPackage[k].Winner = text
  700. continue
  701. }
  702. }
  703. }
  704. }
  705. }
  706. if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
  707. for kc, cv := range v.SpaceKV.KvTags {
  708. if kc == "预算" && v.Budget <= 0 {
  709. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  710. if len(moneys) > 0 {
  711. if vf, ok := moneys[0].(float64); ok {
  712. blockPackage[k].Budget = vf
  713. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  714. } else if vi, ok := moneys[0].(int); ok {
  715. blockPackage[k].Budget = float64(vi)
  716. blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
  717. }
  718. }
  719. } else if kc == "中标金额" && v.Bidamount <= 0 {
  720. moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
  721. if len(moneys) > 0 {
  722. if vf, ok := moneys[0].(float64); ok {
  723. blockPackage[k].Bidamount = vf
  724. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  725. } else if vi, ok := moneys[0].(int); ok {
  726. blockPackage[k].Bidamount = float64(vi)
  727. blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
  728. }
  729. }
  730. } else if kc == "中标单位" && v.Winner == "" {
  731. blockPackage[k].Winner = cv[0].Value
  732. }
  733. }
  734. }
  735. }
  736. //从正文里面找分包
  737. func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
  738. blockPackage = map[string]*util.BlockPackage{}
  739. //从正文里面找分包
  740. divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
  741. for k, v := range blockPackage {
  742. findWinnerBugetBidmountByKv(v, blockPackage, k)
  743. }
  744. //winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
  745. return
  746. }
  // divisionPackageChild looks for package (lot) markers inside one block of
  // text and, when any are found, records each package's text and key/value
  // pairs in *blockPackage.
  //
  // Outline of what the body does:
  //  1. content is rewritten by a series of package-level regexps (line-break
  //     normalisation, label insertion such as "中标单位:" / "预算金额:").
  //  2. CheckMultiPackage reports the package markers; from here on the text it
  //     returns (con) is used instead of content.
  //  3. A newline is inserted in front of every retained marker offset.
  //  4. For each marker, the key text directly in front of it is remembered as
  //     the package's "head key".
  //  5. interceptText assigns a text span to every marker offset.
  //  6. Each span is parsed with GetKVAll / SspacekvEntity.Entrance and stored
  //     under util.PackageNumberConvert(<pkg key>); a repeated package is merged
  //     into the existing entry.
  //
  // Parameters:
  //   - blockPackage: destination map; entries are written through the pointer,
  //     so it must point to a non-nil map.
  //   - content, title: block text and its title; title is also used as the
  //     fallback label prefixed to a package's text.
  //   - isFindWinnerOrder: only referenced by commented-out code at the end.
  //   - accuracy: copied into every new util.BlockPackage.
  //   - isSite, codeSite: forwarded unchanged to GetKVAll and
  //     SspacekvEntity.Entrance.
  //
  // Returns (false, "") when title matches untitleReg, when CheckMultiPackage
  // reports no packages, or when the single marker found is the suffix of the
  // text. Otherwise returns true plus the leftover text: whatever interceptText
  // returned as surplus, extended by the tails cut off from over-long spans.
  func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
  	// Check whether the text contains packages at all; first normalise line
  	// breaks with the file-level wrap regexps.
  	content = replaceSenstiveReg1.ReplaceAllString(content,"$1\n$2")
  	content = regFJWarap.ReplaceAllString(content, "\n")
  	content = regAZWarap.ReplaceAllString(content, "\n")
  	content = regStrWrap.ReplaceAllString(content, "\n")
  	content = regMoreWrap.ReplaceAllString(content, "\n")
  	content = regEndWrap.ReplaceAllString(content, "")
  	content = regBZJWarap.ReplaceAllString(content, "")
  	// Replace sensitive wording: rewrite matched fragments so that each one
  	// sits on its own line, with an explicit "中标单位:" label where needed.
  	content = packageReg1.ReplaceAllString(content,"${1}\n中标单位:${2}\n")
  	content = packageReg2.ReplaceAllString(content,"\n标段${1}:")
  	content = packageReg3.ReplaceAllString(content,"\n${1}:")
  	content = packageReg4.ReplaceAllString(content,"\n${1}\n中标单位:${3}\n")
  	content = packageReg5.ReplaceAllString(content,"\n${1}\n中标单位:${4}\n")
  	content = packageReg6.ReplaceAllString(content,"\n$2\n中标单位:$1\n$4\n中标单位:$3")
  	// Rewrite amounts that are split across lines, adding an explicit
  	// "预算金额:" label.
  	content = packageReg50.ReplaceAllString(content,"$1$2")
  	content = packageReg20.ReplaceAllString(content,"\n预算金额:${2}\n")
  	content = packageReg21.ReplaceAllString(content,"\n${2}\n预算金额:${3}\n${4}\n预算金额:${5}")
  	// Example of the input targeted above (budget listed per package):
  	// "6、项目预算:1包3689028.00元,2包700000.00元。"
  	if untitleReg.MatchString(title){
  		return false, ""
  	}
  	con, pkg, flag := CheckMultiPackage(content) // find the package names (pkg)
  	if !flag {
  		return false, ""
  	}
  	// util.Debug(con)
  	// util.Debug(pkg)
  	// Insert a line break in front of every package marker.
  	// NOTE(review): v[0] is compiled as a regexp without regexp.QuoteMeta and
  	// the same pattern is recompiled in the later loops; MustCompile panics on
  	// an invalid pattern — confirm CheckMultiPackage only yields regexp-safe
  	// names.
  	appendWarpIndex := []int{} // byte offsets of package markers within con (e.g. offset 300 in a text of length 1000)
  	for _, v := range pkg {
  		// A single marker that is merely the suffix of the text is not a
  		// real package.
  		if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
  			return false, ""
  		}
  		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
  		for _, sv := range is {
  			appendWarpIndex = append(appendWarpIndex, sv[0])
  		}
  	}
  	// Sort the offsets and drop those lying within 10 bytes of their predecessor.
  	appendWarpIndex = getPkgIndex(appendWarpIndex)
  	// Rebuild con with "\n" inserted at every retained offset.
  	// NOTE(review): when appendWarpIndex is empty the loop body never runs and
  	// con is replaced by "" — confirm this is intended.
  	conTemp := ""
  	for k, v := range appendWarpIndex {
  		if k == 0 {
  			conTemp += con[:v] + "\n"
  		} else {
  			conTemp += "\n" + con[appendWarpIndex[k-1]:v]
  		}
  		if k == len(appendWarpIndex)-1 {
  			conTemp += "\n" + con[v:]
  		}
  	}
  	con = conTemp
  	con = replSerial.ReplaceAllString(con, "\n")
  	con = regMoreWrap.ReplaceAllString(con, "\n")
  	// Locate the packages in the rewritten text.
  	indexMap := map[int]int{}             // marker start -> marker end
  	indexKeyStringMap := map[int]string{} // marker start -> head key text found in front of it
  	indexKeyIntMap := map[int]int{}       // marker start -> end offset of the whole key+marker match
  	indexs := []int{}                     // every marker start offset
  	startEndMap := map[int]int{}          // marker start -> start offset of the key in front of it
  	pkgIndexMap := map[string][]int{}     // package name -> its marker start offsets
  	indexPkgMap := map[int]string{}       // marker start -> package name
  	// Sub-headings; indexTile2 is tried only when indexTile finds nothing.
  	titleindexs := indexTile.FindAllStringIndex(con, -1)
  	if len(titleindexs) == 0 {
  		titleindexs = indexTile2.FindAllStringIndex(con, -1)
  	}
  	// Walk the packages; a key that appears in front of a package marker is
  	// recorded so it can be attached to the package that follows it.
  	for _, v := range pkg {
  		pgflag := v[0] + "[::]*"
  		is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
  		for _, sv := range is {
  			indexMap[sv[0]] = sv[1]
  			indexs = append(indexs, sv[0])
  			pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
  			indexPkgMap[sv[0]] = v[0]
  		}
  		// Key in front of the package marker, starting at the beginning of a line.
  		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
  		if len(keys) == 0 {
  			// Key in front of the package marker, ending with a colon and a line break.
  			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
  		}
  		if len(keys) == 0 {
  			// Last fallback: a short key introduced by "注" plus a colon.
  			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
  		}
  		for _, key := range keys {
  			// key[4]:key[5] is capture group 2 (the key text); in all three
  			// patterns it is directly followed by pgflag, so key[5] is where
  			// the marker begins. key[1] is the end of the whole match.
  			startEndMap[key[5]] = key[4]
  			// Normalise the key: strip whitespace, make sure it ends with a
  			// separator, collapse repeated colons.
  			headkey := con[key[4]:key[5]]
  			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
  			if !regDivision.MatchString(headkey) {
  				headkey += ":"
  			}
  			headkey = moreColonReg.ReplaceAllString(headkey, ":")
  			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
  			if len(colonIndexs) > 1 {
  				// Several separators: keep only the segment between the
  				// second-to-last and the last one.
  				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
  			}
  			indexKeyStringMap[key[5]] = headkey
  			indexKeyIntMap[key[5]] = key[1]
  		}
  	}
  	indexs = getPkgIndex(indexs)
  	// A marker without a head key of its own inherits the head key of the
  	// preceding marker, unless its recorded match end equals its marker end.
  	for ik, iv := range indexs {
  		if indexKeyStringMap[iv] != "" {
  			continue
  		}
  		if indexKeyIntMap[iv] == indexMap[iv] {
  			continue
  		}
  		if ik > 0 {
  			indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
  		}
  	}
  	// Work out the text span (and allowed segment count) for every marker.
  	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
  	// Extract every package's content and split it into key/value pairs.
  	for _, iv := range indexs {
  		text := indexTextMap[iv]
  		tmptext := text
  		// Segment boundaries inside this span.
  		warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
  		if len(indexWarpMap) > 0 {
  			// A per-marker limit is available; it replaces the global maximum.
  			maxWarpCount = indexWarpMap[iv]
  		}
  		if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
  			// Span has more segments than allowed: cut it after the
  			// maxWarpCount-th boundary and hand the tail back as surplus text.
  			textTemp := text
  			text = textTemp[:warpIndex[maxWarpCount-1][1]]
  			surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
  		}
  		for bk, bv := range pkg {
  			// The span belongs to the package whose name it starts with.
  			if !strings.HasPrefix(text, bv[0]) {
  				continue
  			}
  			index := util.PackageNumberConvert(bk)
  			// Strip the package-name prefix (original note: the space must be
  			// kept because the key/value splitting relies on it).
  			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
  			// NOTE(review): strings.TrimLeft treats its second argument as a set
  			// of characters, not as a prefix (strings.TrimPrefix does that), and
  			// the second and third operands below are identical as written —
  			// confirm the intent.
  			if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
  				// Prefix the text with a label: the sub-heading found just before
  				// the first sub-heading that starts after this marker, or the
  				// block title as fallback.
  				var tagtitle string
  				for i, v := range titleindexs {
  					if i == 0 {
  						continue
  					}
  					if v[0] > iv {
  						tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
  						break
  					}
  				}
  				tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
  				if tagtitle == "" {
  					tagtitle = title
  				} else if strings.Contains(tagtitle, bv[0]) && title != "" {
  					tagtitle = title
  				}
  				text = tagtitle + ":" + text
  			}
  			headKey := ""
  			if indexKeyStringMap[iv] != "" {
  				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
  				headKey = indexKeyStringMap[iv]
  				//}
  				// Forget the head key stored for this package's first marker
  				// offset (the loop exits after one iteration).
  				for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
  					delete(indexKeyStringMap, pkgIndexMap_v)
  					break
  				}
  			}
  			// The same package occurring several times in one block is merged
  			// into a single entry.
  			if (*blockPackage)[index] != nil {
  				// Merge the text.
  				(*blockPackage)[index].Text += "\n" + text
  				// Merge the colon-separated key/values.
  				// NOTE(review): TrimLeft(text, headKey) strips leading characters
  				// contained in headKey, not the headKey prefix — confirm.
  				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
  				if headKey != "" {
  					kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
  					MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
  				}
  				MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
  				// Merge the space-separated key/values.
  				spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
  				MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
  			} else {
  				newBpkg := &util.BlockPackage{
  					Origin: bk,
  					Text: text,
  					Index: index,
  					Name: bv[0],
  					Type: bv[1],
  					Accuracy: accuracy,
  				}
  				//fmt.Println(text)
  				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
  				if headKey != "" {
  					kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
  					MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
  				}
  				// Attach the extracted key/value fields to the new entry.
  				newBpkg.ColonKV = finalKv
  				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
  				(*blockPackage)[index] = newBpkg
  			}
  		}
  	}
  	// Winner ordering (disabled).
  	//if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
  	//	for _, v := range *blockPackage {
  	//		v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2, isSite, codeSite)
  	//	}
  	//}
  	return true, surplusText
  }
  // getPkgIndex sorts indexs in place (ascending) and returns the offsets
  // that start a new cluster: an offset is dropped when it lies within 10 of
  // the offset directly before it in sorted order.
  //
  // When at least one offset was dropped and every offset except the first was
  // dropped (i.e. all offsets form one chain of close neighbours), an empty
  // slice is returned instead. The result is never nil.
  func getPkgIndex(indexs []int) []int {
  	sort.Ints(indexs)
  	kept := make([]int, 0)
  	dropped := 0
  	for pos := range indexs {
  		// The first offset is always kept; later ones only when the gap to
  		// their sorted predecessor exceeds 10.
  		if pos == 0 || indexs[pos]-indexs[pos-1] > 10 {
  			kept = append(kept, indexs[pos])
  			continue
  		}
  		dropped++
  	}
  	if dropped > 0 && dropped == len(indexs)-1 {
  		return []int{}
  	}
  	return kept
  }
  979. //每个包对应的结束位置,都是整行结束
  980. func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
  981. //util.Debug(con)
  982. surplusText := ""
  983. indexTextMap := map[int]string{}
  984. indexWarpMap := map[int]int{}
  985. maxWarpCount := 0
  986. for ik, iv := range indexs {
  987. text := ""
  988. if ik < len(indexs)-1 {
  989. if startEndMap[indexs[ik+1]] != 0 {
  990. text = con[iv:startEndMap[indexs[ik+1]]]
  991. } else {
  992. text = con[iv:indexs[ik+1]]
  993. }
  994. } else {
  995. text = con[iv:]
  996. }
  997. //fmt.Println(text)
  998. tmptext := text
  999. //if strings.Contains(text, "、") {
  1000. // text = strings.Split(text, "、")[0]
  1001. //} else
  1002. if strings.Contains(text, "\n") {
  1003. texts := strings.Split(text, "\n")
  1004. text2 := ""
  1005. if ik+1 < len(indexs)-1 {
  1006. if startEndMap[indexs[ik+1+1]] != 0 {
  1007. text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
  1008. } else {
  1009. text2 = con[indexs[ik+1]:indexs[ik+1+1]]
  1010. }
  1011. if texts[len(texts)-1] == text2 {
  1012. text = texts[0]
  1013. }
  1014. }
  1015. }
  1016. if utf8.RuneCountInString(text) < 5 {
  1017. indexTextMap[iv] = tmptext
  1018. } else {
  1019. indexTextMap[iv] = text
  1020. }
  1021. warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
  1022. if warpCount > maxWarpCount {
  1023. maxWarpCount = warpCount
  1024. }
  1025. indexWarpMap[iv] = warpCount
  1026. if ik == 0 {
  1027. surplusText += con[:iv]
  1028. }
  1029. }
  1030. pkgLaw := ""
  1031. if len(pkgIndexMap) > 1 {
  1032. //有规律的出现 AB or ABAB
  1033. if pkgLaw == "" {
  1034. prevVal := ""
  1035. notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
  1036. indexMaxMap := map[int]int{}
  1037. for ik, iv := range indexs {
  1038. if notRepeatCount == len(pkgIndexMap) {
  1039. notRepeatCount = 0
  1040. }
  1041. if prevVal != indexPkgMap[iv] {
  1042. notRepeatCount++
  1043. } else {
  1044. notRepeatCount = -1
  1045. currentIndex = ik
  1046. break
  1047. }
  1048. prevVal = indexPkgMap[iv]
  1049. if notRepeatCount == len(pkgIndexMap) {
  1050. indexMaxMap[iv] = onceMax
  1051. onceMax = 0
  1052. }
  1053. if indexWarpMap[iv] > onceMax {
  1054. onceMax = indexWarpMap[iv]
  1055. allMax = onceMax
  1056. }
  1057. if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
  1058. notRepeatCount = -2
  1059. currentIndex = ik
  1060. }
  1061. }
  1062. //util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
  1063. if len(indexMaxMap) > 0 {
  1064. pkgLaw = "AB"
  1065. thisMax := 0
  1066. for ik := len(indexs) - 1; ik >= 0; ik-- {
  1067. iv := indexs[ik]
  1068. if currentIndex != -1 && ik >= currentIndex {
  1069. indexWarpMap[iv] = allMax
  1070. continue
  1071. }
  1072. if indexMaxMap[iv] > 0 {
  1073. thisMax = indexMaxMap[iv]
  1074. }
  1075. indexWarpMap[iv] = thisMax
  1076. }
  1077. }
  1078. }
  1079. }
  1080. if pkgLaw == "" {
  1081. indexWarpMap = map[int]int{}
  1082. }
  1083. //util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
  1084. return surplusText, maxWarpCount, indexTextMap, indexWarpMap
  1085. }
  1086. //分块之后的kv
  1087. func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
  1088. blocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
  1089. kvs := []*util.Kv{}
  1090. for _, v := range blocks {
  1091. //util.Debug(v.Text)
  1092. // for _, vvv := range v.ColonKV.Kvs {
  1093. // util.Debug(vvv.Key, vvv.Value, vvv.Title)
  1094. // }
  1095. kvs = append(kvs, v.ColonKV.Kvs...)
  1096. }
  1097. return kvs
  1098. }