analystep.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. /**
  2. 信息预处理入口
  3. **/
  4. package pretreated
  5. import (
  6. "encoding/json"
  7. "jy/util"
  8. //"log"
  9. "strings"
  10. "github.com/PuerkitoBio/goquery"
  11. )
  12. func AnalyStart(job *util.Job,isSite bool,codeSite string) {
  13. con := job.Content
  14. //全文的需要修复表格
  15. con = RepairCon(con)
  16. //格式化正文
  17. con = formatText(con, "all")
  18. job.Content = con
  19. //计算表格占比,返回表格数组、占比
  20. tabs, ration := ComputeConRatio(con, 1)
  21. if len(tabs) > 0 {
  22. newcon, newtabs, newration := FindBigText(con, ration, tabs)
  23. if newcon != "" {
  24. con = newcon
  25. tabs = newtabs
  26. ration = newration
  27. }
  28. }
  29. blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock,isSite,codeSite) //分块
  30. if len(blockArrays) > 0 { //有分块
  31. //从块里面找分包
  32. job.BlockPackage = FindPackageFromBlocks(&blockArrays,isSite,codeSite) //从块里面找分包
  33. for _, bl := range blockArrays {
  34. //log.Println(bl.Text)
  35. if len([]rune(bl.Text)) > 80 {
  36. bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock,isSite,codeSite)
  37. for _, bl_bl := range bl.Block {
  38. processTableInBlock(bl_bl, job, false,isSite,codeSite)
  39. }
  40. }
  41. FindProjectCode(bl.Text, job) //匹配项目编号
  42. processTableInBlock(bl, job, true,isSite,codeSite)
  43. //新加 未分块table中未能解析到中标候选人,从正文中解析
  44. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  45. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1,isSite,codeSite)
  46. job.Winnerorder = bl.Winnerorder
  47. }
  48. job.Block = append(job.Block, bl)
  49. }
  50. } else { //未分块,创建分块
  51. //log.Println(con)
  52. bl := &util.Block{}
  53. newCon := con
  54. if len(tabs) > 0 { //解析表格逻辑
  55. job.HasTable = 1 //添加标识:文本中有table
  56. newCon = TextAfterRemoveTable(con)
  57. job.BlockPackage = FindPackageFromText(job.Title, newCon,isSite,codeSite)
  58. for i := 0; i < len(tabs); i++ {
  59. //log.Println(tabs[i].Text())
  60. //添加标识:文本中有table
  61. tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock,isSite,codeSite) //解析表格入口 返回:汇总表格对象
  62. processTableResult(tabres, bl, job,isSite,codeSite)
  63. }
  64. } else {
  65. //从正文里面找分包
  66. job.BlockPackage = FindPackageFromText(job.Title, newCon,isSite,codeSite)
  67. }
  68. bl.Text = HtmlToText(con)
  69. //log.Println(bl.Text)
  70. FindProjectCode(bl.Text, job) //匹配项目编号
  71. //调用kv解析
  72. bl.ColonKV = GetKVAll(bl.Text, "", nil, 1,isSite,codeSite)
  73. bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil,isSite,codeSite)
  74. //新加 未分块table中未能解析到中标候选人,从正文中解析
  75. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  76. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1,isSite,codeSite)
  77. job.Winnerorder = bl.Winnerorder
  78. }
  79. job.Block = append(job.Block, bl)
  80. }
  81. }
  82. func processTableInBlock(bl *util.Block, job *util.Job, packageFlag,isSite bool,codeSite string) {
  83. //块中再查找表格(块,处理完把值赋到块)
  84. tabs, _ := ComputeConRatio(bl.Text, 2)
  85. for _, tab := range tabs {
  86. job.HasTable = 1
  87. tmptag := ""
  88. if tab.Nodes[0] != nil && tab.Nodes[0].PrevSibling != nil{
  89. tmptag = strings.TrimSpace(tab.Nodes[0].PrevSibling.Data)
  90. }
  91. //添加标识:文本中有table
  92. tabres := AnalyTableV2(tab, job.Category, tmptag, tab.Text(), 2, job.SourceMid, job.RuleBlock,isSite,codeSite) //解析表格入口 返回:汇总表格对象
  93. if packageFlag {
  94. tabres.PackageMap = nil
  95. tabres.IsMultiPackage = false
  96. }
  97. processTableResult(tabres, bl, job,isSite,codeSite) //分析table解析结果
  98. if bl.Title == "" && tabres.BlockTag != "" {
  99. bl.Title = tabres.BlockTag
  100. }
  101. }
  102. }
  103. //匹配项目编号
  104. func FindProjectCode(newCon string, job *util.Job) {
  105. newCon = HtmlToText(newCon)
  106. if strings.TrimSpace(newCon) == "" {
  107. return
  108. }
  109. var proCode string
  110. blCode := &util.Block{}
  111. /* if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
  112. //5d424bdfa5cb26b9b7ac7a85
  113. //5d425a48a5cb26b9b7df5fec
  114. //5d425506a5cb26b9b7cd2c3c
  115. splitStr := strings.Split(newConTMP, " ")
  116. if len(splitStr) >= 2 {
  117. if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
  118. newCon = "项目编号:" + splitStr[len(splitStr)-1]
  119. } else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
  120. //5d4253f3a5cb26b9b7ca2662
  121. newCon = "项目编号:" + tmpstr
  122. }
  123. } else if len(splitStr) == 1 {
  124. if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
  125. newCon = "项目编号:" + tmpstr
  126. } else if strings.Contains(newConTMP, "、") {
  127. tmpstrs := strings.Split(newCon, "、")
  128. newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
  129. }
  130. }
  131. }
  132. proCode = projectcodeReg.FindString(newCon)
  133. if proCode != "" {
  134. ckv := GetKVAll(proCode, job.Title, nil, 1)
  135. blCode.ColonKV = ckv
  136. blCode.Text = proCode
  137. job.Block = append(job.Block, blCode)
  138. } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
  139. ckv := GetKVAll(proCode, job.Title, nil, 1)
  140. blCode.ColonKV = ckv
  141. blCode.Text = proCode
  142. job.Block = append(job.Block, blCode)
  143. } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
  144. ckv := GetKVAll(proCode, job.Title, nil, 1)
  145. blCode.Text = proCode
  146. blCode.ColonKV = ckv
  147. job.Block = append(job.Block, blCode)
  148. }*/
  149. if proCode = jsonReg.FindString(newCon); proCode != "" {
  150. jsonMap := make(map[string]string)
  151. json.Unmarshal([]byte(proCode), &jsonMap)
  152. jobKv := util.NewJobKv()
  153. kvTags := map[string][]*util.Tag{}
  154. for k, v := range jsonMap {
  155. kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v})
  156. tmpkv := new(util.Kv)
  157. tmpkv.Line = k + v
  158. tmpkv.Key = k
  159. tmpkv.Value = v
  160. jobKv.Kvs = append(jobKv.Kvs, tmpkv)
  161. }
  162. jobKv.KvTags = kvTags
  163. blCode.ColonKV = jobKv
  164. job.Block = append(job.Block, blCode)
  165. }
  166. }
  167. //分析table解析结果
  168. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job,isSite bool,codeSite string) {
  169. //解析结果中的kv
  170. if block.TableKV == nil {
  171. block.TableKV = util.NewJobKv()
  172. }
  173. MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
  174. //分包
  175. tablePackage := map[string]*util.BlockPackage{}
  176. if tabres.IsMultiPackage {
  177. //分包中的map
  178. for _, v := range tabres.PackageMap.Keys {
  179. blockPackage, ok := tabres.PackageMap.Map[v].(*util.BlockPackage)
  180. if !ok {
  181. continue
  182. }
  183. //解析kv
  184. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  185. labelKVs := []*util.Kv{}
  186. if blockPackage.TableKV != nil {
  187. for tk, tv := range blockPackage.TableKV.KvTags {
  188. for _, tvv := range tv {
  189. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  190. labelKVs = append(labelKVs, &util.Kv{
  191. Key: tk,
  192. Value: tvv.Value,
  193. })
  194. }
  195. }
  196. }
  197. } else {
  198. blockPackage.TableKV = util.NewJobKv()
  199. }
  200. MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil,isSite,codeSite))
  201. tablePackage[v] = blockPackage
  202. }
  203. }
  204. //处理中标人排序
  205. wror := []map[string]interface{}{}
  206. for _, v := range tabres.WinnerOrder {
  207. entName, _ := v["entname"].(string)
  208. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  209. if price, ok := v["price"].(string); ok {
  210. v["price"] = winnerOrderEntity.clear("中标金额", price)
  211. }
  212. v["type"] = 2
  213. wror = append(wror, v)
  214. }
  215. if len(wror) > 0 {
  216. job.Winnerorder = wror
  217. }
  218. //分包
  219. if len(tablePackage) > 0 {
  220. pkgMap := map[string]*util.BlockPackage{}
  221. for tk, tv := range tablePackage {
  222. bv := job.BlockPackage[tk]
  223. if bv == nil {
  224. pkgMap[tk] = tv
  225. continue
  226. }
  227. bv.Text += "\n" + tv.Text
  228. /************table中的分包替换块里面找到的****************/
  229. //
  230. if tv.ColonKV != nil {
  231. if bv.ColonKV == nil {
  232. bv.ColonKV = util.NewJobKv()
  233. }
  234. MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags)
  235. }
  236. //
  237. if tv.TableKV != nil {
  238. if bv.TableKV == nil {
  239. bv.TableKV = util.NewJobKv()
  240. }
  241. MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags)
  242. }
  243. //
  244. if tv.Origin != "" {
  245. bv.Origin = tv.Origin
  246. }
  247. //
  248. if tv.Index != "" {
  249. bv.Index = tv.Index
  250. }
  251. //
  252. if tv.Type != "" {
  253. bv.Type = tv.Type
  254. }
  255. //
  256. if tv.BidStatus != "" {
  257. bv.BidStatus = tv.BidStatus
  258. }
  259. //
  260. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  261. bv.WinnerOrder = tv.WinnerOrder
  262. }
  263. }
  264. for k, v := range pkgMap {
  265. job.BlockPackage[k] = v
  266. }
  267. }
  268. //增加brand
  269. if tabres.HasKey != 0 {
  270. job.HasKey = tabres.HasKey
  271. }
  272. if tabres.HasBrand != 0 {
  273. job.HasBrand = tabres.HasBrand
  274. }
  275. if tabres.HasGoods != 0 {
  276. job.HasGoods = tabres.HasGoods
  277. }
  278. job.HasGoods = tabres.HasGoods
  279. if len(tabres.BrandData) > 0 { //分块table合并
  280. for _, v := range tabres.BrandData {
  281. job.BrandData = append(job.BrandData, v) //加入job
  282. }
  283. }
  284. }
  285. //一行多列 一列多行,按照分块逻辑处理
  286. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  287. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  288. if len(tabs) != 1 {
  289. return "" //5c2aca5ea5cb26b9b7a8229b
  290. }
  291. for _, tab := range tabs {
  292. content := ""
  293. tbody := tab.ChildrenFiltered("tbody,thead")
  294. var tr *goquery.Selection
  295. if tbody.Length() == 1 {
  296. tr = tbody.ChildrenFiltered("tr")
  297. } else {
  298. tr = tab.ChildrenFiltered("tr")
  299. }
  300. if tr.Length() == 1 {
  301. tds := tr.ChildrenFiltered("td")
  302. tds.Each(func(index int, sn *goquery.Selection) {
  303. ret, _ := sn.Html()
  304. if strings.TrimSpace(ret) != "" {
  305. content += ret + "\n"
  306. }
  307. })
  308. } else {
  309. flag := true
  310. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  311. th := sn.ChildrenFiltered("th")
  312. td := sn.ChildrenFiltered("td")
  313. if th.Length() > 0 || td.Length() > 1 {
  314. flag = false
  315. return false
  316. } else if td.Length() == 1 {
  317. ret, _ := td.Html()
  318. if strings.TrimSpace(ret) != "" {
  319. content += ret + "\n"
  320. }
  321. }
  322. return true
  323. })
  324. if !flag {
  325. return ""
  326. }
  327. }
  328. if content != "" {
  329. content = regMoreWrap.ReplaceAllString(content, "\n")
  330. content = regEndWrap.ReplaceAllString(content, "")
  331. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  332. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  333. con, _ = doc.Find("body").Html()
  334. }
  335. }
  336. return con
  337. }
  338. //查找大文本,5次
  339. func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  340. content = tableDivideBlock(con, r, t)
  341. if content == "" {
  342. return
  343. }
  344. for i := 0; i < 4; i++ {
  345. if content != "" {
  346. tabs, ration = ComputeConRatio(content, 1)
  347. if len(tabs) > 0 {
  348. con := tableDivideBlock(content, ration, tabs)
  349. if con == "" {
  350. return
  351. } else {
  352. content = con
  353. }
  354. } else {
  355. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  356. content = doc.Text()
  357. return
  358. }
  359. } else {
  360. return
  361. }
  362. }
  363. return
  364. }