analystep.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
// Package pretreated is the entry point for information preprocessing.
  4. package pretreated
  5. import (
  6. "fmt"
  7. "jy/util"
  8. qutil "qfw/util"
  9. "strings"
  10. "github.com/PuerkitoBio/goquery"
  11. )
  12. func AnalyStart(job *util.Job) {
  13. con := job.Content
  14. //全文的需要修复表格
  15. con = RepairCon(con)
  16. //格式化正文
  17. con = formatText(con, "all")
  18. job.Content = con
  19. //
  20. tabs, ration := ComputeConRatio(con, 1)
  21. if len(tabs) > 0 {
  22. newcon, newtabs, newration := findBigText(con, ration, tabs)
  23. if newcon != "" && newration == 0 {
  24. con = newcon
  25. tabs = newtabs
  26. ration = newration
  27. }
  28. }
  29. blockArrays, _ := DivideBlock(con, 1)
  30. if len(blockArrays) > 0 { //有分块
  31. //从块里面找分包
  32. job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title)
  33. for _, bl := range blockArrays {
  34. if len([]rune(bl.Text)) > 80 {
  35. ba1, _ := DivideBlock(bl.Text, 1)
  36. if len(ba1) > 0 {
  37. t := ""
  38. for _, t1 := range ba1 {
  39. t += t1.Text
  40. }
  41. bl.Text = t
  42. bl.ColonKV = GetKVAll(t, bl.Title, 1)
  43. bl.SpaceKV = SspacekvEntity.Entrance(t, bl.Title)
  44. }
  45. }
  46. //块中再查找表格(块,处理完把值赋到块)
  47. t1, _ := ComputeConRatio(bl.Text, 2)
  48. if len(t1) > 0 {
  49. job.HasTable = 1 //添加标识:文本中有table
  50. fmt.Println("分块")
  51. tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid)
  52. processTableResult(tabres, bl, job)
  53. if bl.Title == "" && tabres.BlockTag != "" {
  54. bl.Title = tabres.BlockTag
  55. }
  56. // for k, v := range bl.TableKV.Kv {
  57. // log.Println("bl.TableKV.Kv", k, v)
  58. // }
  59. }
  60. job.Block = append(job.Block, bl)
  61. }
  62. } else { //未分块,创建分块
  63. bl := &util.Block{}
  64. newCon := con
  65. if len(tabs) > 0 { //解析表格逻辑
  66. job.HasTable = 1 //添加标识:文本中有table
  67. newCon = TextAfterRemoveTable(con)
  68. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  69. fmt.Println("未分块")
  70. tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid)
  71. processTableResult(tabres, bl, job)
  72. // for k, v := range bl.TableKV.Kv {
  73. // log.Println("bl.TableKV.Kv", k, v)
  74. // }
  75. } else {
  76. //从正文里面找分包
  77. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  78. }
  79. //调用kv解析
  80. bl.ColonKV = GetKVAll(newCon, "", 1)
  81. bl.SpaceKV = SspacekvEntity.Entrance(newCon, "")
  82. job.Block = append(job.Block, bl)
  83. }
  84. }
  85. //分析table解析结果
  86. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
  87. //解析结果中的kv
  88. fmt.Println("-------------", tabres.SortKV.Keys)
  89. kv := map[string]string{}
  90. for k, v := range tabres.SortKV.Map {
  91. kv[k] = qutil.ObjToString(v)
  92. }
  93. kvIndex := map[string]int{}
  94. for k, v := range tabres.SortKVWeight {
  95. kvIndex[k] = v
  96. }
  97. block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex}
  98. //分包
  99. tablePackage := map[string]*util.BlockPackage{}
  100. if tabres.IsMultiPackage {
  101. //分包中的map
  102. for k, v := range tabres.PackageMap.Map {
  103. blockPackage, ok := v.(*util.BlockPackage)
  104. if !ok {
  105. continue
  106. }
  107. //解析kv
  108. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  109. labelKVs := []*util.Kv{}
  110. if blockPackage.TableKV != nil && blockPackage.TableKV.Kv != nil {
  111. for tk, tv := range blockPackage.TableKV.Kv {
  112. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  113. labelKVs = append(labelKVs, &util.Kv{
  114. Key: tk,
  115. Value: tv,
  116. })
  117. }
  118. }
  119. }
  120. labelKV, _ := KvTagsToKV(labelKVs, "", nil, 2)
  121. for lk, lv := range labelKV {
  122. if blockPackage.TableKV.Kv[lk] != "" {
  123. continue
  124. }
  125. blockPackage.TableKV.Kv[lk] = lv
  126. }
  127. tablePackage[k] = blockPackage
  128. }
  129. }
  130. //处理中标人排序
  131. wror := []map[string]interface{}{}
  132. for _, v := range tabres.WinnerOrder {
  133. entName, _ := v["entname"].(string)
  134. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  135. if price, ok := v["price"].(string); ok {
  136. v["price"] = winnerOrderEntity.clear("中标金额", price)
  137. }
  138. v["type"] = 2
  139. wror = append(wror, v)
  140. }
  141. if len(wror) > 0 {
  142. job.Winnerorder = wror
  143. }
  144. //分包
  145. if len(tablePackage) > 0 {
  146. pkgMap := map[string]*util.BlockPackage{}
  147. for tk, tv := range tablePackage {
  148. bv := job.BlockPackage[tk]
  149. if bv == nil {
  150. pkgMap[tk] = tv
  151. continue
  152. }
  153. bv.Text += "\n" + tv.Text
  154. /************table中的分包替换块里面找到的****************/
  155. //
  156. if tv.ColonKV != nil {
  157. if bv.ColonKV == nil {
  158. bv.ColonKV = util.NewJobKv()
  159. }
  160. for k, v := range tv.ColonKV.Kv {
  161. if bv.ColonKV.Kv[k] != "" {
  162. continue
  163. }
  164. bv.ColonKV.Kv[k] = v
  165. }
  166. }
  167. //
  168. if tv.TableKV != nil {
  169. if bv.TableKV == nil {
  170. bv.TableKV = util.NewJobKv()
  171. }
  172. for k, v := range tv.TableKV.Kv {
  173. if bv.TableKV.Kv[k] != "" {
  174. continue
  175. }
  176. bv.TableKV.Kv[k] = v
  177. }
  178. }
  179. //
  180. if tv.Origin != "" {
  181. bv.Origin = tv.Origin
  182. }
  183. //
  184. if tv.Index != "" {
  185. bv.Index = tv.Index
  186. }
  187. //
  188. if tv.Type != "" {
  189. bv.Type = tv.Type
  190. }
  191. //
  192. if tv.BidStatus != "" {
  193. bv.BidStatus = tv.BidStatus
  194. }
  195. //
  196. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  197. bv.WinnerOrder = tv.WinnerOrder
  198. }
  199. }
  200. for k, v := range pkgMap {
  201. job.BlockPackage[k] = v
  202. }
  203. }
  204. //增加brand
  205. if tabres.HasKey != 0 {
  206. job.HasKey = tabres.HasKey
  207. }
  208. if tabres.HasBrand != 0 {
  209. job.HasBrand = tabres.HasBrand
  210. }
  211. if tabres.HasGoods != 0 {
  212. job.HasGoods = tabres.HasGoods
  213. }
  214. job.HasGoods = tabres.HasGoods
  215. if len(tabres.BrandData) > 0 { //分块table合并
  216. for _, v := range tabres.BrandData {
  217. job.BrandData = append(job.BrandData, v) //加入job
  218. }
  219. }
  220. }
  221. //一行多列 一列多行,按照分块逻辑处理
  222. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  223. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  224. if len(tabs) != 1 {
  225. return ""
  226. }
  227. for _, tab := range tabs {
  228. content := ""
  229. tbody := tab.ChildrenFiltered("tbody,thead")
  230. var tr *goquery.Selection
  231. if tbody.Length() == 1 {
  232. tr = tbody.ChildrenFiltered("tr")
  233. } else {
  234. tr = tab.ChildrenFiltered("tr")
  235. }
  236. if tr.Length() == 1 {
  237. tds := tr.ChildrenFiltered("td")
  238. tds.Each(func(index int, sn *goquery.Selection) {
  239. ret, _ := sn.Html()
  240. if strings.TrimSpace(ret) != "" {
  241. content += ret + "\n"
  242. }
  243. })
  244. } else {
  245. flag := true
  246. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  247. th := sn.ChildrenFiltered("th")
  248. td := sn.ChildrenFiltered("td")
  249. if th.Length() > 0 || td.Length() > 1 {
  250. flag = false
  251. return false
  252. } else if td.Length() == 1 {
  253. ret, _ := td.Html()
  254. if strings.TrimSpace(ret) != "" {
  255. content += ret + "\n"
  256. }
  257. }
  258. return true
  259. })
  260. if !flag {
  261. return ""
  262. }
  263. }
  264. if content != "" {
  265. content = regMoreWrap.ReplaceAllString(content, "\n")
  266. content = regEndWrap.ReplaceAllString(content, "")
  267. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  268. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  269. con, _ = doc.Html()
  270. }
  271. }
  272. return con
  273. }
  274. //查找大文本,5次
  275. func findBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  276. content = tableDivideBlock(con, r, t)
  277. if content == "" {
  278. return
  279. }
  280. for i := 0; i < 4; i++ {
  281. if content != "" {
  282. tabs, ration = ComputeConRatio(content, 1)
  283. if len(tabs) > 0 {
  284. content = tableDivideBlock(content, ration, tabs)
  285. if content == "" {
  286. return
  287. }
  288. } else {
  289. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  290. content = doc.Text()
  291. return
  292. }
  293. } else {
  294. return
  295. }
  296. }
  297. return
  298. }