analystep.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. /**
  2. 信息预处理入口
  3. **/
  4. package pretreated
  5. import (
  6. "encoding/json"
  7. "jy/util"
  8. "strings"
  9. "github.com/PuerkitoBio/goquery"
  10. )
  11. func AnalyStart(job *util.Job) {
  12. con := job.Content
  13. //全文的需要修复表格
  14. con = RepairCon(con)
  15. //格式化正文
  16. con = formatText(con, "all")
  17. job.Content = con
  18. //计算表格占比,返回表格数组、占比
  19. tabs, ration := ComputeConRatio(con, 1)
  20. if len(tabs) > 0 {
  21. newcon, newtabs, newration := FindBigText(con, ration, tabs)
  22. if newcon != "" {
  23. con = newcon
  24. tabs = newtabs
  25. ration = newration
  26. }
  27. }
  28. blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
  29. if len(blockArrays) > 0 { //有分块
  30. //从块里面找分包
  31. job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
  32. for _, bl := range blockArrays {
  33. //log.Println(bl.Text)
  34. if len([]rune(bl.Text)) > 80 {
  35. bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
  36. for _, bl_bl := range bl.Block {
  37. processTableInBlock(bl_bl, job)
  38. }
  39. }
  40. processTableInBlock(bl, job)
  41. //新加 未分块table中未能解析到中标候选人,从正文中解析
  42. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  43. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  44. }
  45. job.Block = append(job.Block, bl)
  46. }
  47. } else { //未分块,创建分块
  48. bl := &util.Block{}
  49. newCon := con
  50. if len(tabs) > 0 { //解析表格逻辑
  51. job.HasTable = 1 //添加标识:文本中有table
  52. newCon = TextAfterRemoveTable(con)
  53. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  54. for i := 0; i < len(tabs); i++ {
  55. //添加标识:文本中有table
  56. tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
  57. processTableResult(tabres, bl, job)
  58. }
  59. // for k, v := range bl.TableKV.Kv {
  60. // log.Println("bl.TableKV.Kv", k, v)
  61. // }
  62. } else {
  63. //从正文里面找分包
  64. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  65. }
  66. FindProjectCode(newCon, job) //匹配项目编号
  67. bl.Text = HtmlToText(con)
  68. //调用kv解析
  69. bl.ColonKV = GetKVAll(newCon, "", nil, 1)
  70. bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
  71. //新加 未分块table中未能解析到中标候选人,从正文中解析
  72. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  73. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  74. }
  75. //log.Println(bl.Text)
  76. job.Block = append(job.Block, bl)
  77. }
  78. }
  79. func processTableInBlock(bl *util.Block, job *util.Job) {
  80. //块中再查找表格(块,处理完把值赋到块)
  81. tabs, _ := ComputeConRatio(bl.Text, 2)
  82. for _, tab := range tabs {
  83. job.HasTable = 1
  84. //添加标识:文本中有table
  85. tabres := AnalyTableV2(tab, job.Category, bl.Title, tab.Text(), 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
  86. processTableResult(tabres, bl, job) //分析table解析结果
  87. if bl.Title == "" && tabres.BlockTag != "" {
  88. bl.Title = tabres.BlockTag
  89. }
  90. }
  91. }
  92. //匹配项目编号
  93. func FindProjectCode(newCon string, job *util.Job) {
  94. newCon = TextAfterRemoveTable(newCon)
  95. if strings.TrimSpace(newCon) == "" {
  96. return
  97. }
  98. var proCode string
  99. proCode = projectcodeReg.FindString(newCon)
  100. blCode := &util.Block{}
  101. if proCode != "" {
  102. ckv := GetKVAll(proCode, job.Title, nil, 1)
  103. blCode.ColonKV = ckv
  104. blCode.Text = proCode
  105. job.Block = append(job.Block, blCode)
  106. } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
  107. ckv := GetKVAll(proCode, job.Title, nil, 1)
  108. blCode.ColonKV = ckv
  109. blCode.Text = proCode
  110. job.Block = append(job.Block, blCode)
  111. } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
  112. ckv := GetKVAll(proCode, job.Title, nil, 1)
  113. blCode.Text = proCode
  114. blCode.ColonKV = ckv
  115. job.Block = append(job.Block, blCode)
  116. }
  117. if proCode = jsonReg.FindString(newCon); proCode != "" {
  118. jsonMap := make(map[string]string)
  119. json.Unmarshal([]byte(proCode), &jsonMap)
  120. jobKv := util.NewJobKv()
  121. kvTags := map[string][]*util.Tag{}
  122. for k, v := range jsonMap {
  123. kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v})
  124. tmpkv := new(util.Kv)
  125. tmpkv.Line = k + v
  126. tmpkv.Key = k
  127. tmpkv.Value = v
  128. jobKv.Kvs = append(jobKv.Kvs, tmpkv)
  129. }
  130. jobKv.KvTags = kvTags
  131. blCode.ColonKV = jobKv
  132. job.Block = append(job.Block, blCode)
  133. }
  134. }
  135. //分析table解析结果
  136. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
  137. //解析结果中的kv
  138. if block.TableKV == nil {
  139. block.TableKV = util.NewJobKv()
  140. }
  141. MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
  142. //分包
  143. tablePackage := map[string]*util.BlockPackage{}
  144. if tabres.IsMultiPackage {
  145. //分包中的map
  146. for k, v := range tabres.PackageMap.Map {
  147. blockPackage, ok := v.(*util.BlockPackage)
  148. if !ok {
  149. continue
  150. }
  151. //解析kv
  152. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  153. labelKVs := []*util.Kv{}
  154. if blockPackage.TableKV != nil {
  155. for tk, tv := range blockPackage.TableKV.KvTags {
  156. for _, tvv := range tv {
  157. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  158. labelKVs = append(labelKVs, &util.Kv{
  159. Key: tk,
  160. Value: tvv.Value,
  161. })
  162. }
  163. }
  164. }
  165. } else {
  166. blockPackage.TableKV = util.NewJobKv()
  167. }
  168. MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil))
  169. tablePackage[k] = blockPackage
  170. }
  171. }
  172. //处理中标人排序
  173. wror := []map[string]interface{}{}
  174. for _, v := range tabres.WinnerOrder {
  175. entName, _ := v["entname"].(string)
  176. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  177. if price, ok := v["price"].(string); ok {
  178. v["price"] = winnerOrderEntity.clear("中标金额", price)
  179. }
  180. v["type"] = 2
  181. wror = append(wror, v)
  182. }
  183. if len(wror) > 0 {
  184. job.Winnerorder = wror
  185. }
  186. //分包
  187. if len(tablePackage) > 0 {
  188. pkgMap := map[string]*util.BlockPackage{}
  189. for tk, tv := range tablePackage {
  190. bv := job.BlockPackage[tk]
  191. if bv == nil {
  192. pkgMap[tk] = tv
  193. continue
  194. }
  195. bv.Text += "\n" + tv.Text
  196. /************table中的分包替换块里面找到的****************/
  197. //
  198. if tv.ColonKV != nil {
  199. if bv.ColonKV == nil {
  200. bv.ColonKV = util.NewJobKv()
  201. }
  202. MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags)
  203. }
  204. //
  205. if tv.TableKV != nil {
  206. if bv.TableKV == nil {
  207. bv.TableKV = util.NewJobKv()
  208. }
  209. MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags)
  210. }
  211. //
  212. if tv.Origin != "" {
  213. bv.Origin = tv.Origin
  214. }
  215. //
  216. if tv.Index != "" {
  217. bv.Index = tv.Index
  218. }
  219. //
  220. if tv.Type != "" {
  221. bv.Type = tv.Type
  222. }
  223. //
  224. if tv.BidStatus != "" {
  225. bv.BidStatus = tv.BidStatus
  226. }
  227. //
  228. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  229. bv.WinnerOrder = tv.WinnerOrder
  230. }
  231. }
  232. for k, v := range pkgMap {
  233. job.BlockPackage[k] = v
  234. }
  235. }
  236. //增加brand
  237. if tabres.HasKey != 0 {
  238. job.HasKey = tabres.HasKey
  239. }
  240. if tabres.HasBrand != 0 {
  241. job.HasBrand = tabres.HasBrand
  242. }
  243. if tabres.HasGoods != 0 {
  244. job.HasGoods = tabres.HasGoods
  245. }
  246. job.HasGoods = tabres.HasGoods
  247. if len(tabres.BrandData) > 0 { //分块table合并
  248. for _, v := range tabres.BrandData {
  249. job.BrandData = append(job.BrandData, v) //加入job
  250. }
  251. }
  252. }
  253. //一行多列 一列多行,按照分块逻辑处理
  254. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  255. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  256. if len(tabs) != 1 {
  257. return "" //5c2aca5ea5cb26b9b7a8229b
  258. }
  259. for _, tab := range tabs {
  260. content := ""
  261. tbody := tab.ChildrenFiltered("tbody,thead")
  262. var tr *goquery.Selection
  263. if tbody.Length() == 1 {
  264. tr = tbody.ChildrenFiltered("tr")
  265. } else {
  266. tr = tab.ChildrenFiltered("tr")
  267. }
  268. if tr.Length() == 1 {
  269. tds := tr.ChildrenFiltered("td")
  270. tds.Each(func(index int, sn *goquery.Selection) {
  271. ret, _ := sn.Html()
  272. if strings.TrimSpace(ret) != "" {
  273. content += ret + "\n"
  274. }
  275. })
  276. } else {
  277. flag := true
  278. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  279. th := sn.ChildrenFiltered("th")
  280. td := sn.ChildrenFiltered("td")
  281. if th.Length() > 0 || td.Length() > 1 {
  282. flag = false
  283. return false
  284. } else if td.Length() == 1 {
  285. ret, _ := td.Html()
  286. if strings.TrimSpace(ret) != "" {
  287. content += ret + "\n"
  288. }
  289. }
  290. return true
  291. })
  292. if !flag {
  293. return ""
  294. }
  295. }
  296. if content != "" {
  297. content = regMoreWrap.ReplaceAllString(content, "\n")
  298. content = regEndWrap.ReplaceAllString(content, "")
  299. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  300. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  301. con, _ = doc.Find("body").Html()
  302. }
  303. }
  304. return con
  305. }
  306. //查找大文本,5次
  307. func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  308. content = tableDivideBlock(con, r, t)
  309. if content == "" {
  310. return
  311. }
  312. for i := 0; i < 4; i++ {
  313. if content != "" {
  314. tabs, ration = ComputeConRatio(content, 1)
  315. if len(tabs) > 0 {
  316. con := tableDivideBlock(content, ration, tabs)
  317. if con == "" {
  318. return
  319. } else {
  320. content = con
  321. }
  322. } else {
  323. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  324. content = doc.Text()
  325. return
  326. }
  327. } else {
  328. return
  329. }
  330. }
  331. return
  332. }