analystep.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. /**
  2. 信息预处理入口
  3. **/
  4. package pretreated
  5. import (
  6. "encoding/json"
  7. "fmt"
  8. "jy/clear"
  9. "jy/util"
  10. "regexp"
  11. "strings"
  12. "github.com/PuerkitoBio/goquery"
  13. )
  14. var yjReg *regexp.Regexp = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|主要人员相关资料|投标文件格式|唱标记录|否决投标的?情况说明")
  15. var hisReg = regexp.MustCompile("(开标记录|类似业绩|历史业绩|填报项目业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</td>)")
  16. var hisReg2 = regexp.MustCompile("(开标记录|业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</tr>|</table>|</td>)")
  17. var formattext = regexp.MustCompile("(投标总价)([0-9,.万元]*)")
  18. var formattext2 = regexp.MustCompile("中标单价.*(中标总价.*)")
  19. var formattext3 = regexp.MustCompile("(同类项目业绩、|[1-9].[0-9]包段划分)")
  20. func AnalyStart(job *util.Job, isSite bool, codeSite string) {
  21. con := job.Content
  22. //全文的需要修复表格
  23. con = RepairCon(con)
  24. //格式化正文 -断点
  25. con = formattext3.ReplaceAllString(con,"")
  26. con = hisReg.ReplaceAllString(con, "${2}")
  27. con = hisReg2.ReplaceAllString(con, "${2}")
  28. con = formattext.ReplaceAllString(con, "${1}:${2}")
  29. con = formattext2.ReplaceAllString(con, "${1}")
  30. con = formatText(con, "all")
  31. job.Content = con
  32. //计算表格占比,返回表格数组、占比
  33. tabs, _ := ComputeConRatio(con, 1)
  34. /*if len(tabs) > 0 {
  35. newcon, newtabs, newration := FindBigText(con, ration, tabs)
  36. if newcon != "" {
  37. con = newcon
  38. con = formatText(con, "all")
  39. tabs = newtabs
  40. ration = newration
  41. }
  42. }*/
  43. job.BlockPackage = map[string]*util.BlockPackage{}
  44. //分块+处理每块kv
  45. blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite)
  46. if len(blockArrays) > 0 { //有分块
  47. //从块里面找分包-文本
  48. if !job.IsFile {
  49. job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
  50. }
  51. for _, bl := range blockArrays {
  52. //log.Println(bl.Text)
  53. if len([]rune(bl.Text)) > 80 {
  54. bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock, isSite, codeSite)
  55. for _, bl_bl := range bl.Block {
  56. processTableInBlock(bl_bl, job, isSite, codeSite)
  57. }
  58. }
  59. FindProjectCode(bl.Text, job) //匹配项目编号
  60. processTableInBlock(bl, job, isSite, codeSite) //处理表格
  61. //新加 未分块table中未能解析到中标候选人,从正文中解析
  62. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  63. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
  64. job.Winnerorder = bl.Winnerorder
  65. }
  66. job.Block = append(job.Block, bl)
  67. }
  68. } else { //未分块,创建分块
  69. //log.Println(con)
  70. bl := &util.Block{}
  71. newCon := con
  72. //log.Println(con)
  73. if len(tabs) > 0 { //解析表格逻辑
  74. job.HasTable = 1 //添加标识:文本中有table
  75. newCon = TextAfterRemoveTable(con)
  76. //log.Println(newCon)
  77. if newCon != "" {
  78. job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
  79. }
  80. for i := 0; i < len(tabs); i++ {
  81. blockTag := ""
  82. if len(tabs[i].Nodes) > 0 {
  83. if tabs[i].Nodes[0].PrevSibling != nil {
  84. blockTag = tabs[i].Nodes[0].PrevSibling.Data
  85. }
  86. }
  87. //添加标识:文本中有table
  88. //blockTag - 块标签
  89. //处理表格
  90. tabres := AnalyTableV2(tabs[i], job.Category, blockTag, con, 1, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象
  91. processTableResult(tabres, bl, job, isSite, codeSite)
  92. }
  93. } else {
  94. //从正文里面找分包
  95. job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
  96. }
  97. bl.Text = HtmlToText(con)
  98. //log.Println(bl.Text)
  99. FindProjectCode(bl.Text, job) //匹配项目编号
  100. if yjReg.MatchString(bl.Text) {
  101. if strings.Index(bl.Text, "业绩") > 1 {
  102. bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")]
  103. }
  104. }
  105. //调用kv解析库-处理detail
  106. bl.Text = formatText(bl.Text, "all")
  107. //处理 :
  108. bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
  109. //处理空格
  110. bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
  111. //新加 未分块table中未能解析到中标候选人,从正文中解析
  112. if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
  113. bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
  114. job.Winnerorder = bl.Winnerorder
  115. }
  116. job.Block = append(job.Block, bl)
  117. }
  118. }
  119. func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite string) {
  120. //块中再查找表格(块,处理完把值赋到块)
  121. //bl.Text = formatText(bl.Text, "biangeng")
  122. tabs, _ := ComputeConRatio(bl.Text, 2)
  123. for i, tab := range tabs {
  124. job.HasTable = 1
  125. tmptag := ""
  126. if i == 0 && bl.Title != "" && len(bl.Title) < 20 {
  127. tmptag = bl.Title
  128. } else if tab.Nodes[0] != nil && tab.Nodes[0].PrevSibling != nil {
  129. tmptag = strings.TrimSpace(tab.Nodes[0].PrevSibling.Data)
  130. }
  131. //添加标识:文本中有table
  132. tabres := AnalyTableV2(tab, job.Category, tmptag, tab.Text(), 2, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象
  133. //if packageFlag {
  134. // tabres.PackageMap = nil
  135. // tabres.IsMultiPackage = false
  136. //}
  137. processTableResult(tabres, bl, job, isSite, codeSite) //分析table解析结果
  138. if bl.Title == "" && tabres.BlockTag != "" {
  139. bl.Title = tabres.BlockTag
  140. }
  141. }
  142. }
  143. //匹配项目编号
  144. func FindProjectCode(newCon string, job *util.Job) {
  145. newCon = HtmlToText(newCon)
  146. if strings.TrimSpace(newCon) == "" {
  147. return
  148. }
  149. var proCode string
  150. blCode := &util.Block{}
  151. /* if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
  152. //5d424bdfa5cb26b9b7ac7a85
  153. //5d425a48a5cb26b9b7df5fec
  154. //5d425506a5cb26b9b7cd2c3c
  155. splitStr := strings.Split(newConTMP, " ")
  156. if len(splitStr) >= 2 {
  157. if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
  158. newCon = "项目编号:" + splitStr[len(splitStr)-1]
  159. } else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
  160. //5d4253f3a5cb26b9b7ca2662
  161. newCon = "项目编号:" + tmpstr
  162. }
  163. } else if len(splitStr) == 1 {
  164. if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
  165. newCon = "项目编号:" + tmpstr
  166. } else if strings.Contains(newConTMP, "、") {
  167. tmpstrs := strings.Split(newCon, "、")
  168. newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
  169. }
  170. }
  171. }
  172. proCode = projectcodeReg.FindString(newCon)
  173. if proCode != "" {
  174. ckv := GetKVAll(proCode, job.Title, nil, 1)
  175. blCode.ColonKV = ckv
  176. blCode.Text = proCode
  177. job.Block = append(job.Block, blCode)
  178. } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
  179. ckv := GetKVAll(proCode, job.Title, nil, 1)
  180. blCode.ColonKV = ckv
  181. blCode.Text = proCode
  182. job.Block = append(job.Block, blCode)
  183. } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
  184. ckv := GetKVAll(proCode, job.Title, nil, 1)
  185. blCode.Text = proCode
  186. blCode.ColonKV = ckv
  187. job.Block = append(job.Block, blCode)
  188. }*/
  189. if proCode = jsonReg.FindString(newCon); proCode != "" {
  190. jsonMap := make(map[string]string)
  191. json.Unmarshal([]byte(proCode), &jsonMap)
  192. jobKv := util.NewJobKv()
  193. kvTags := map[string][]*util.Tag{}
  194. for k, v := range jsonMap {
  195. kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v})
  196. tmpkv := new(util.Kv)
  197. tmpkv.Line = k + v
  198. tmpkv.Key = k
  199. tmpkv.Value = v
  200. jobKv.Kvs = append(jobKv.Kvs, tmpkv)
  201. }
  202. jobKv.KvTags = kvTags
  203. blCode.ColonKV = jobKv
  204. job.Block = append(job.Block, blCode)
  205. }
  206. }
  207. //分析table解析结果
  208. func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, isSite bool, codeSite string) {
  209. //解析结果中的kv
  210. if block.TableKV == nil {
  211. block.TableKV = util.NewJobKv()
  212. }
  213. MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
  214. isorderwiner := true
  215. //分包
  216. tablePackage := map[string]*util.BlockPackage{}
  217. if tabres.IsMultiPackage && !job.IsFile {
  218. //分包中的map
  219. for _, v := range tabres.PackageMap.Keys {
  220. blockPackage, ok := tabres.PackageMap.Map[v].(*util.BlockPackage)
  221. if !ok {
  222. continue
  223. }
  224. //解析kv
  225. //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
  226. labelKVs := []*util.Kv{}
  227. if blockPackage.TableKV != nil {
  228. for tk, tv := range blockPackage.TableKV.KvTags {
  229. for _, tvv := range tv {
  230. if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
  231. labelKVs = append(labelKVs, &util.Kv{
  232. Key: tk,
  233. Value: tvv.Value,
  234. })
  235. }
  236. }
  237. }
  238. } else {
  239. blockPackage.TableKV = util.NewJobKv()
  240. }
  241. MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil, isSite, codeSite))
  242. if blockPackage.WinnerOrder != nil && len(blockPackage.WinnerOrder) > 0 {
  243. for i, v := range blockPackage.WinnerOrder {
  244. if entName, ok := v["entname"].(string); ok {
  245. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  246. if i == 0 && blockPackage.Winner == "" {
  247. blockPackage.Winner = fmt.Sprint(v["entname"])
  248. }
  249. if price, ok := v["price"].(string); ok && len(price) < 30 && len(price) > 0 && !clearnum.MatchString(price) {
  250. v["price"] = winnerOrderEntity.clear("中标金额", price)
  251. if !blockPackage.IsTrueBidamount {
  252. moneys := clear.ObjToMoney([]interface{}{v["price"], ""}, job.SpiderCode, job.IsClearnMoney)
  253. if len(moneys) > 0 {
  254. if vf, ok := moneys[0].(float64); ok {
  255. blockPackage.Bidamount = vf
  256. blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool)
  257. } else if vi, ok := moneys[0].(int); ok {
  258. blockPackage.Bidamount = float64(vi)
  259. blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool)
  260. }
  261. }
  262. }
  263. }
  264. v["type"] = tabres.Toptype + "_" + tabres.BlockTag + "_" + blockPackage.Origin
  265. job.Winnerorder = append(job.Winnerorder, v)
  266. }
  267. }
  268. isorderwiner = false
  269. }
  270. tablePackage[v] = blockPackage
  271. }
  272. }
  273. //处理中标人排序
  274. if isorderwiner {
  275. tmpWins := make(map[string]int)
  276. for _, v := range job.Winnerorder {
  277. if v["entname"] != nil && v["entname"] != "" {
  278. tmpWins[v["entname"].(string)] = v["sort"].(int)
  279. }
  280. }
  281. wror := []map[string]interface{}{}
  282. if len(tmpWins) == 0 && len(tabres.WinnerOrder) > 0 {
  283. for _, v := range tabres.WinnerOrder {
  284. if entName, ok := v["entname"].(string); ok {
  285. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  286. if price, ok := v["price"].(string); ok {
  287. v["price"] = winnerOrderEntity.clear("中标金额", price)
  288. }
  289. v["type"] = tabres.Toptype + "_" + tabres.BlockTag
  290. wror = append(wror, v)
  291. }
  292. }
  293. } else {
  294. for _, v := range tabres.WinnerOrder {
  295. if entName, ok := v["entname"].(string); ok {
  296. v["entname"] = winnerOrderEntity.clear("中标单位", entName)
  297. if v["entname"] == "" {
  298. continue
  299. }
  300. if price, ok := v["price"].(string); ok {
  301. v["price"] = winnerOrderEntity.clear("中标金额", price)
  302. }
  303. v["type"] = tabres.Toptype + "_" + tabres.BlockTag
  304. if tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] == nil {
  305. continue
  306. } else if tmpWins[v["entname"].(string)] != v["sort"].(int) && v["type"] != tabres.BlockTag {
  307. wror = append(wror, v)
  308. continue
  309. } else if tmpWins[v["entname"].(string)] > 0 && tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] != nil {
  310. if tmpWins[v["entname"].(string)]-1 >= 0 && len(job.Winnerorder) > 0 {
  311. if len(job.Winnerorder) > (tmpWins[v["entname"].(string)] - 1) {
  312. job.Winnerorder[tmpWins[v["entname"].(string)]-1] = v
  313. }
  314. continue
  315. }
  316. }
  317. }
  318. }
  319. }
  320. if len(wror) > 0 {
  321. job.Winnerorder = append(job.Winnerorder, wror...)
  322. block.Winnerorder = job.Winnerorder
  323. }
  324. }
  325. //分包
  326. if len(tablePackage) > 0 && !job.IsFile {
  327. pkgMap := map[string]*util.BlockPackage{}
  328. for tk, tv := range tablePackage {
  329. bv := job.BlockPackage[tk]
  330. if bv == nil {
  331. pkgMap[tk] = tv
  332. continue
  333. }
  334. bv.Text += "\n" + tv.Text
  335. /************table中的分包替换块里面找到的****************/
  336. //
  337. if tv.ColonKV != nil {
  338. if bv.ColonKV == nil {
  339. bv.ColonKV = util.NewJobKv()
  340. }
  341. MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags)
  342. }
  343. //
  344. if tv.TableKV != nil {
  345. if bv.TableKV == nil {
  346. bv.TableKV = util.NewJobKv()
  347. }
  348. MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags)
  349. }
  350. //
  351. if tv.Origin != "" {
  352. bv.Origin = tv.Origin
  353. }
  354. //
  355. if tv.Index != "" {
  356. bv.Index = tv.Index
  357. }
  358. //
  359. if tv.Type != "" {
  360. bv.Type = tv.Type
  361. }
  362. //
  363. if tv.BidStatus != "" {
  364. bv.BidStatus = tv.BidStatus
  365. }
  366. //
  367. if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
  368. bv.WinnerOrder = tv.WinnerOrder
  369. }
  370. if tv.Bidamount >= 0 && tv.IsTrueBidamount {
  371. bv.Bidamount = tv.Bidamount
  372. bv.IsTrueBidamount = tv.IsTrueBidamount
  373. }
  374. if tv.Budget >= 0 && tv.IsTrueBudget {
  375. bv.Budget = tv.Budget
  376. bv.IsTrueBudget = tv.IsTrueBudget
  377. }
  378. }
  379. for k, v := range pkgMap {
  380. job.BlockPackage[k] = v
  381. }
  382. }
  383. //增加brand
  384. if tabres.HasKey != 0 {
  385. job.HasKey = tabres.HasKey
  386. }
  387. if tabres.HasBrand != 0 {
  388. job.HasBrand = tabres.HasBrand
  389. }
  390. if tabres.HasGoods != 0 {
  391. job.HasGoods = tabres.HasGoods
  392. }
  393. job.HasGoods = tabres.HasGoods
  394. if len(tabres.BrandData) > 0 { //分块table合并
  395. for _, v := range tabres.BrandData {
  396. job.BrandData = append(job.BrandData, v) //加入job
  397. }
  398. }
  399. //加入job
  400. if len(tabres.PriceNumberData) > 0 {
  401. for _, tabledata := range tabres.PriceNumberData { //校验重复的table对象
  402. job.PriceNumberData = append(job.PriceNumberData, tabledata)
  403. }
  404. }
  405. }
  406. //一行多列 一列多行,按照分块逻辑处理
  407. //ration==1 遍历所有tabs,ration!=1 tabs只有一个
  408. func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
  409. if len(tabs) != 1 {
  410. return "" //5c2aca5ea5cb26b9b7a8229b
  411. }
  412. for _, tab := range tabs {
  413. content := ""
  414. tbody := tab.ChildrenFiltered("tbody,thead")
  415. var tr *goquery.Selection
  416. if tbody.Length() == 1 {
  417. tr = tbody.ChildrenFiltered("tr")
  418. } else {
  419. tr = tab.ChildrenFiltered("tr")
  420. }
  421. if tr.Length() == 1 {
  422. tds := tr.ChildrenFiltered("td")
  423. tds.Each(func(index int, sn *goquery.Selection) {
  424. ret, _ := sn.Html()
  425. if strings.TrimSpace(ret) != "" {
  426. content += ret + "\n"
  427. }
  428. })
  429. } else {
  430. flag := true
  431. tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
  432. th := sn.ChildrenFiltered("th")
  433. td := sn.ChildrenFiltered("td")
  434. if th.Length() > 0 || td.Length() > 1 {
  435. flag = false
  436. return false
  437. } else if td.Length() == 1 {
  438. ret, _ := td.Html()
  439. if strings.TrimSpace(ret) != "" {
  440. content += ret + "\n"
  441. }
  442. }
  443. return true
  444. })
  445. if !flag {
  446. return ""
  447. }
  448. }
  449. if content != "" {
  450. content = regMoreWrap.ReplaceAllString(content, "\n")
  451. content = regEndWrap.ReplaceAllString(content, "")
  452. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  453. doc.Find("table").Eq(0).ReplaceWithHtml(content)
  454. con, _ = doc.Find("body").Html()
  455. }
  456. }
  457. return con
  458. }
  459. //查找大文本,5次
  460. func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
  461. content = tableDivideBlock(con, r, t)
  462. if content == "" {
  463. return
  464. }
  465. for i := 0; i < 4; i++ {
  466. if content != "" {
  467. tabs, ration = ComputeConRatio(content, 1)
  468. if len(tabs) > 0 {
  469. con := tableDivideBlock(content, ration, tabs)
  470. if con == "" {
  471. return
  472. } else {
  473. content = con
  474. }
  475. } else {
  476. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  477. content = doc.Text()
  478. return
  479. }
  480. } else {
  481. return
  482. }
  483. }
  484. return
  485. }