analystep.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. /**
  2. 信息预处理入口
  3. **/
  4. package pretreated
  5. import (
  6. "encoding/json"
  7. "jy/util"
  8. qutil "qfw/util"
  9. "strings"
  10. "github.com/PuerkitoBio/goquery"
  11. )
  12. func AnalyStart(job *util.Job) {
  13. con := job.Content
  14. //全文的需要修复表格
  15. con = RepairCon(con)
  16. //格式化正文
  17. con = formatText(con, "all")
  18. job.Content = con
  19. //计算表格占比,返回表格数组、占比
  20. tabs, ration := ComputeConRatio(con, 1)
  21. if len(tabs) > 0 {
  22. newcon, newtabs, newration := FindBigText(con, ration, tabs)
  23. if newcon != "" {
  24. con = newcon
  25. tabs = newtabs
  26. ration = newration
  27. }
  28. }
  29. blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
  30. if len(blockArrays) > 0 { //有分块
  31. //从块里面找分包
  32. job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
  33. for _, bl := range blockArrays {
  34. if len([]rune(bl.Text)) > 80 {
  35. bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
  36. }
  37. //块中再查找表格(块,处理完把值赋到块)
  38. t1, _ := ComputeConRatio(bl.Text, 2)
  39. if len(t1) > 0 {
  40. job.HasTable = 1
  41. for i:=0;i<len(tabs);i++{
  42. bl := &util.Block{}
  43. //添加标识:文本中有table
  44. tabres := AnalyTableV2(t1[0], job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
  45. processTableResult(tabres, bl, job) //分析table解析结果
  46. if bl.Title == "" && tabres.BlockTag != "" {
  47. bl.Title = tabres.BlockTag
  48. }
  49. if len(bl.TableKV.Kv)>0{
  50. bl.Text = tabs[i].Text()
  51. job.Block = append(job.Block, bl)
  52. }
  53. }
  54. // for k, v := range bl.TableKV.Kv {
  55. // log.Println("bl.TableKV.Kv", k, v)
  56. // }
  57. }
  58. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  59. //新加table未找到winnerorder, 从分块文本中找中标候选人
  60. job.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  61. }
  62. job.Block = append(job.Block, bl)
  63. }
  64. } else { //未分块,创建分块
  65. bl := &util.Block{}
  66. newCon := con
  67. if len(tabs) > 0 { //解析表格逻辑
  68. job.HasTable = 1 //添加标识:文本中有table
  69. newCon = TextAfterRemoveTable(con)
  70. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  71. for i:=0;i<len(tabs);i++{
  72. bl := &util.Block{}
  73. //添加标识:文本中有table
  74. tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock)//解析表格入口 返回:汇总表格对象
  75. processTableResult(tabres, bl, job) //分析table解析结果
  76. if bl.Title == "" && tabres.BlockTag != "" {
  77. bl.Title = tabres.BlockTag
  78. }
  79. if len(bl.TableKV.Kv) >0 {
  80. bl.Text = tabs[i].Text()
  81. job.Block = append(job.Block, bl)
  82. }
  83. }
  84. // for k, v := range bl.TableKV.Kv {
  85. // log.Println("bl.TableKV.Kv", k, v)
  86. // }
  87. } else {
  88. //从正文里面找分包
  89. job.BlockPackage = FindPackageFromText(job.Title, newCon)
  90. }
  91. //新加 未分块table中未能解析到中标候选人,从正文中解析
  92. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  93. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
  94. }
  95. FindProjectCode(newCon, job) //匹配项目编号
  96. bl.Text = newCon
  97. //调用kv解析
  98. bl.ColonKV = GetKVAll(newCon, "", nil, 1)
  99. bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
  100. job.Block = append(job.Block, bl)
  101. }
  102. for _, v := range job.BlockPackage {
  103. block := &util.Block{}
  104. block.ColonKV = v.ColonKV
  105. block.TableKV = v.TableKV
  106. block.SpaceKV = v.SpaceKV
  107. block.Text = v.Text
  108. block.Winnerorder = v.WinnerOrder
  109. job.Block = append(job.Block, block)
  110. }
  111. }
  112. //匹配项目编号
  113. func FindProjectCode(newCon string, job *util.Job) {
  114. newCon = TextAfterRemoveTable(newCon)
  115. if strings.TrimSpace(newCon) == "" {
  116. return
  117. }
  118. var proCode string
  119. proCode = projectcodeReg.FindString(newCon)
  120. blCode := &util.Block{}
  121. blCode.Text = proCode
  122. if proCode != "" {
  123. ckv := GetKVAll(proCode, job.Title, nil, 1)
  124. blCode.ColonKV = ckv
  125. job.Block = append(job.Block, blCode)
  126. } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
  127. ckv := GetKVAll(proCode, job.Title, nil, 1)
  128. blCode.ColonKV = ckv
  129. job.Block = append(job.Block, blCode)
  130. } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
  131. ckv := GetKVAll(proCode, job.Title, nil, 1)
  132. blCode.ColonKV = ckv
  133. job.Block = append(job.Block, blCode)
  134. }
  135. if proCode = jsonReg.FindString(newCon); proCode != "" {
  136. jsonMap := make(map[string]string)
  137. json.Unmarshal([]byte(proCode), &jsonMap)
  138. jobKv := util.NewJobKv()
  139. for k, v := range jsonMap {
  140. tmpkv := new(util.Kv)
  141. tmpkv.Line = k + v
  142. tmpkv.Key = k
  143. tmpkv.Value = v
  144. jobKv.Kvs = append(jobKv.Kvs, tmpkv)
  145. }
  146. jobKv.Kv = jsonMap
  147. blCode.ColonKV = jobKv
  148. job.Block = append(job.Block, blCode)
  149. }
  150. }
  151. //分析table解析结果
  152. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
  153. //解析结果中的kv
  154. kv := map[string]string{}
  155. for k, v := range tabres.SortKV.Map {
  156. kv[k] = qutil.ObjToString(v)
  157. }
  158. kvIndex := map[string]int{}
  159. for k, v := range tabres.SortKVWeight {
  160. kvIndex[k] = v
  161. }
  162. KvTag := map[string]*util.Tag{}
  163. for k, _ := range tabres.SortKV.NotTagKey {
  164. KvTag[k] = &util.Tag{Weight: util.RetainKvWeight}
  165. }
  166. block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex, KvTag: KvTag}
  167. //分包
  168. tablePackage := map[string]*util.BlockPackage{}
  169. if tabres.IsMultiPackage {
  170. //分包中的map
  171. for k, v := range tabres.PackageMap.Map {
  172. blockPackage, ok := v.(*util.BlockPackage)
  173. if !ok {
  174. continue
  175. }
  176. //解析kv
  177. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  178. labelKVs := []*util.Kv{}
  179. if blockPackage.TableKV != nil && blockPackage.TableKV.Kv != nil {
  180. for tk, tv := range blockPackage.TableKV.Kv {
  181. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  182. labelKVs = append(labelKVs, &util.Kv{
  183. Key: tk,
  184. Value: tv,
  185. })
  186. }
  187. }
  188. }
  189. labelKV, _ := KvTagsToKV(labelKVs, "", nil, 2)
  190. for lk, lv := range labelKV {
  191. if blockPackage.TableKV.Kv[lk] != "" {
  192. continue
  193. }
  194. blockPackage.TableKV.Kv[lk] = lv
  195. }
  196. tablePackage[k] = blockPackage
  197. }
  198. }
  199. //处理中标人排序
  200. wror := []map[string]interface{}{}
  201. for _, v := range tabres.WinnerOrder {
  202. entName, _ := v["entname"].(string)
  203. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  204. if price, ok := v["price"].(string); ok {
  205. v["price"] = winnerOrderEntity.clear("中标金额", price)
  206. }
  207. v["type"] = 2
  208. wror = append(wror, v)
  209. }
  210. if len(wror) > 0 {
  211. job.Winnerorder = wror
  212. }
  213. //分包
  214. if len(tablePackage) > 0 {
  215. pkgMap := map[string]*util.BlockPackage{}
  216. for tk, tv := range tablePackage {
  217. bv := job.BlockPackage[tk]
  218. if bv == nil {
  219. pkgMap[tk] = tv
  220. continue
  221. }
  222. bv.Text += "\n" + tv.Text
  223. /************table中的分包替换块里面找到的****************/
  224. //
  225. if tv.ColonKV != nil {
  226. if bv.ColonKV == nil {
  227. bv.ColonKV = util.NewJobKv()
  228. }
  229. for k, v := range tv.ColonKV.Kv {
  230. if bv.ColonKV.Kv[k] != "" {
  231. continue
  232. }
  233. bv.ColonKV.Kv[k] = v
  234. }
  235. }
  236. //
  237. if tv.TableKV != nil {
  238. if bv.TableKV == nil {
  239. bv.TableKV = util.NewJobKv()
  240. }
  241. for k, v := range tv.TableKV.Kv {
  242. if bv.TableKV.Kv[k] != "" {
  243. continue
  244. }
  245. bv.TableKV.Kv[k] = v
  246. }
  247. }
  248. //
  249. if tv.Origin != "" {
  250. bv.Origin = tv.Origin
  251. }
  252. //
  253. if tv.Index != "" {
  254. bv.Index = tv.Index
  255. }
  256. //
  257. if tv.Type != "" {
  258. bv.Type = tv.Type
  259. }
  260. //
  261. if tv.BidStatus != "" {
  262. bv.BidStatus = tv.BidStatus
  263. }
  264. //
  265. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  266. bv.WinnerOrder = tv.WinnerOrder
  267. }
  268. }
  269. for k, v := range pkgMap {
  270. job.BlockPackage[k] = v
  271. }
  272. }
  273. //增加brand
  274. if tabres.HasKey != 0 {
  275. job.HasKey = tabres.HasKey
  276. }
  277. if tabres.HasBrand != 0 {
  278. job.HasBrand = tabres.HasBrand
  279. }
  280. if tabres.HasGoods != 0 {
  281. job.HasGoods = tabres.HasGoods
  282. }
  283. job.HasGoods = tabres.HasGoods
  284. if len(tabres.BrandData) > 0 { //分块table合并
  285. for _, v := range tabres.BrandData {
  286. job.BrandData = append(job.BrandData, v) //加入job
  287. }
  288. }
  289. }
  290. //一行多列 一列多行,按照分块逻辑处理
  291. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  292. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  293. if len(tabs) != 1 {
  294. return "" //5c2aca5ea5cb26b9b7a8229b
  295. }
  296. for _, tab := range tabs {
  297. content := ""
  298. tbody := tab.ChildrenFiltered("tbody,thead")
  299. var tr *goquery.Selection
  300. if tbody.Length() == 1 {
  301. tr = tbody.ChildrenFiltered("tr")
  302. } else {
  303. tr = tab.ChildrenFiltered("tr")
  304. }
  305. if tr.Length() == 1 {
  306. tds := tr.ChildrenFiltered("td")
  307. tds.Each(func(index int, sn *goquery.Selection) {
  308. ret, _ := sn.Html()
  309. if strings.TrimSpace(ret) != "" {
  310. content += ret + "\n"
  311. }
  312. })
  313. } else {
  314. flag := true
  315. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  316. th := sn.ChildrenFiltered("th")
  317. td := sn.ChildrenFiltered("td")
  318. if th.Length() > 0 || td.Length() > 1 {
  319. flag = false
  320. return false
  321. } else if td.Length() == 1 {
  322. ret, _ := td.Html()
  323. if strings.TrimSpace(ret) != "" {
  324. content += ret + "\n"
  325. }
  326. }
  327. return true
  328. })
  329. if !flag {
  330. return ""
  331. }
  332. }
  333. if content != "" {
  334. content = regMoreWrap.ReplaceAllString(content, "\n")
  335. content = regEndWrap.ReplaceAllString(content, "")
  336. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  337. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  338. con, _ = doc.Find("body").Html()
  339. }
  340. }
  341. return con
  342. }
  343. //查找大文本,5次
  344. func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  345. content = tableDivideBlock(con, r, t)
  346. if content == "" {
  347. return
  348. }
  349. for i := 0; i < 4; i++ {
  350. if content != "" {
  351. tabs, ration = ComputeConRatio(content, 1)
  352. if len(tabs) > 0 {
  353. con := tableDivideBlock(content, ration, tabs)
  354. if con == "" {
  355. return
  356. } else {
  357. content = con
  358. }
  359. } else {
  360. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  361. content = doc.Text()
  362. return
  363. }
  364. } else {
  365. return
  366. }
  367. }
  368. return
  369. }