analystep.go 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. /**
  2. 信息预处理入口
  3. **/
  4. package pretreated
  5. import (
  6. "jy/util"
  7. qutil "qfw/util"
  8. "strings"
  9. "github.com/PuerkitoBio/goquery"
  10. )
  11. func AnalyStart(job *util.Job) {
  12. con := job.Content
  13. //全文的需要修复表格
  14. con = RepairCon(con)
  15. //格式化正文
  16. con = formatText(con, "all")
  17. job.Content = con
  18. //计算表格占比,返回表格数组、占比
  19. tabs, ration := ComputeConRatio(con, 1)
  20. if len(tabs) > 0 {
  21. newcon, newtabs, newration := FindBigText(con, ration, tabs)
  22. if newcon != "" {
  23. con = newcon
  24. tabs = newtabs
  25. ration = newration
  26. }
  27. }
  28. blockArrays, _ := DivideBlock(con, 1, job.RuleBlock) //分块
  29. if len(blockArrays) > 0 { //有分块
  30. //从块里面找分包
  31. job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
  32. for _, bl := range blockArrays {
  33. if len([]rune(bl.Text)) > 80 {
  34. bl.Block, _ = DivideBlock(bl.Text, 1, job.RuleBlock)
  35. }
  36. //块中再查找表格(块,处理完把值赋到块)
  37. t1, _ := ComputeConRatio(bl.Text, 2)
  38. if len(t1) > 0 {
  39. job.HasTable = 1 //添加标识:文本中有table
  40. tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
  41. processTableResult(tabres, bl, job) //分析table解析结果
  42. if bl.Title == "" && tabres.BlockTag != "" {
  43. bl.Title = tabres.BlockTag
  44. }
  45. // for k, v := range bl.TableKV.Kv {
  46. // log.Println("bl.TableKV.Kv", k, v)
  47. // }
  48. }
  49. job.Block = append(job.Block, bl)
  50. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  51. //新加table未找到winnerorder, 从分块文本中找中标候选人
  52. job.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  53. }
  54. }
  55. } else { //未分块,创建分块
  56. bl := &util.Block{}
  57. newCon := con
  58. if len(tabs) > 0 { //解析表格逻辑
  59. job.HasTable = 1 //添加标识:文本中有table
  60. newCon = TextAfterRemoveTable(con)
  61. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  62. tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid, job.RuleBlock)
  63. processTableResult(tabres, bl, job)
  64. // for k, v := range bl.TableKV.Kv {
  65. // log.Println("bl.TableKV.Kv", k, v)
  66. // }
  67. } else {
  68. //从正文里面找分包
  69. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  70. }
  71. //新加 未分块table中未能解析到中标候选人,从正文中解析
  72. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  73. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  74. }
  75. //调用kv解析
  76. bl.ColonKV = GetKVAll(newCon, "", nil, 1)
  77. bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
  78. job.Block = append(job.Block, bl)
  79. }
  80. }
  81. //分析table解析结果
  82. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
  83. //解析结果中的kv
  84. kv := map[string]string{}
  85. for k, v := range tabres.SortKV.Map {
  86. kv[k] = qutil.ObjToString(v)
  87. }
  88. kvIndex := map[string]int{}
  89. for k, v := range tabres.SortKVWeight {
  90. kvIndex[k] = v
  91. }
  92. block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex}
  93. //分包
  94. tablePackage := map[string]*util.BlockPackage{}
  95. if tabres.IsMultiPackage {
  96. //分包中的map
  97. for k, v := range tabres.PackageMap.Map {
  98. blockPackage, ok := v.(*util.BlockPackage)
  99. if !ok {
  100. continue
  101. }
  102. //解析kv
  103. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  104. labelKVs := []*util.Kv{}
  105. if blockPackage.TableKV != nil && blockPackage.TableKV.Kv != nil {
  106. for tk, tv := range blockPackage.TableKV.Kv {
  107. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  108. labelKVs = append(labelKVs, &util.Kv{
  109. Key: tk,
  110. Value: tv,
  111. })
  112. }
  113. }
  114. }
  115. labelKV, _ := KvTagsToKV(labelKVs, "", nil, 2)
  116. for lk, lv := range labelKV {
  117. if blockPackage.TableKV.Kv[lk] != "" {
  118. continue
  119. }
  120. blockPackage.TableKV.Kv[lk] = lv
  121. }
  122. tablePackage[k] = blockPackage
  123. }
  124. }
  125. //处理中标人排序
  126. wror := []map[string]interface{}{}
  127. for _, v := range tabres.WinnerOrder {
  128. entName, _ := v["entname"].(string)
  129. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  130. if price, ok := v["price"].(string); ok {
  131. v["price"] = winnerOrderEntity.clear("中标金额", price)
  132. }
  133. v["type"] = 2
  134. wror = append(wror, v)
  135. }
  136. if len(wror) > 0 {
  137. job.Winnerorder = wror
  138. }
  139. //分包
  140. if len(tablePackage) > 0 {
  141. pkgMap := map[string]*util.BlockPackage{}
  142. for tk, tv := range tablePackage {
  143. bv := job.BlockPackage[tk]
  144. if bv == nil {
  145. pkgMap[tk] = tv
  146. continue
  147. }
  148. bv.Text += "\n" + tv.Text
  149. /************table中的分包替换块里面找到的****************/
  150. //
  151. if tv.ColonKV != nil {
  152. if bv.ColonKV == nil {
  153. bv.ColonKV = util.NewJobKv()
  154. }
  155. for k, v := range tv.ColonKV.Kv {
  156. if bv.ColonKV.Kv[k] != "" {
  157. continue
  158. }
  159. bv.ColonKV.Kv[k] = v
  160. }
  161. }
  162. //
  163. if tv.TableKV != nil {
  164. if bv.TableKV == nil {
  165. bv.TableKV = util.NewJobKv()
  166. }
  167. for k, v := range tv.TableKV.Kv {
  168. if bv.TableKV.Kv[k] != "" {
  169. continue
  170. }
  171. bv.TableKV.Kv[k] = v
  172. }
  173. }
  174. //
  175. if tv.Origin != "" {
  176. bv.Origin = tv.Origin
  177. }
  178. //
  179. if tv.Index != "" {
  180. bv.Index = tv.Index
  181. }
  182. //
  183. if tv.Type != "" {
  184. bv.Type = tv.Type
  185. }
  186. //
  187. if tv.BidStatus != "" {
  188. bv.BidStatus = tv.BidStatus
  189. }
  190. //
  191. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  192. bv.WinnerOrder = tv.WinnerOrder
  193. }
  194. }
  195. for k, v := range pkgMap {
  196. job.BlockPackage[k] = v
  197. }
  198. }
  199. //增加brand
  200. if tabres.HasKey != 0 {
  201. job.HasKey = tabres.HasKey
  202. }
  203. if tabres.HasBrand != 0 {
  204. job.HasBrand = tabres.HasBrand
  205. }
  206. if tabres.HasGoods != 0 {
  207. job.HasGoods = tabres.HasGoods
  208. }
  209. job.HasGoods = tabres.HasGoods
  210. if len(tabres.BrandData) > 0 { //分块table合并
  211. for _, v := range tabres.BrandData {
  212. job.BrandData = append(job.BrandData, v) //加入job
  213. }
  214. }
  215. }
  216. //一行多列 一列多行,按照分块逻辑处理
  217. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  218. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  219. if len(tabs) != 1 {
  220. //return ""
  221. }
  222. for _, tab := range tabs {
  223. content := ""
  224. tbody := tab.ChildrenFiltered("tbody,thead")
  225. var tr *goquery.Selection
  226. if tbody.Length() == 1 {
  227. tr = tbody.ChildrenFiltered("tr")
  228. } else {
  229. tr = tab.ChildrenFiltered("tr")
  230. }
  231. if tr.Length() == 1 {
  232. tds := tr.ChildrenFiltered("td")
  233. tds.Each(func(index int, sn *goquery.Selection) {
  234. ret, _ := sn.Html()
  235. if strings.TrimSpace(ret) != "" {
  236. content += ret + "\n"
  237. }
  238. })
  239. } else {
  240. flag := true
  241. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  242. th := sn.ChildrenFiltered("th")
  243. td := sn.ChildrenFiltered("td")
  244. if th.Length() > 0 || td.Length() > 1 {
  245. flag = false
  246. return false
  247. } else if td.Length() == 1 {
  248. ret, _ := td.Html()
  249. if strings.TrimSpace(ret) != "" {
  250. content += ret + "\n"
  251. }
  252. }
  253. return true
  254. })
  255. if !flag {
  256. return ""
  257. }
  258. }
  259. if content != "" {
  260. content = regMoreWrap.ReplaceAllString(content, "\n")
  261. content = regEndWrap.ReplaceAllString(content, "")
  262. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  263. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  264. con, _ = doc.Find("body").Html()
  265. }
  266. }
  267. return con
  268. }
  269. //查找大文本,5次
  270. func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  271. content = tableDivideBlock(con, r, t)
  272. if content == "" {
  273. return
  274. }
  275. for i := 0; i < 4; i++ {
  276. if content != "" {
  277. tabs, ration = ComputeConRatio(content, 1)
  278. if len(tabs) > 0 {
  279. con := tableDivideBlock(content, ration, tabs)
  280. if con == "" {
  281. return
  282. } else {
  283. content = con
  284. }
  285. } else {
  286. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  287. content = doc.Text()
  288. return
  289. }
  290. } else {
  291. return
  292. }
  293. }
  294. return
  295. }