// main_blocktest.go (7.4 KB)

  1. package main
  2. import (
  3. "fmt"
  4. "jy/extract"
  5. "jy/mongodbutil"
  6. "jy/pretreated"
  7. ju "jy/util"
  8. "log"
  9. "os"
  10. qu "qfw/util"
  11. "regexp"
  12. "time"
  13. )
  // Package-level scratch state used by the block-title dump in com.
  var (
  	// f is the file that block-title lines are written to. Nothing in this
  	// file assigns it (the OpenFile call in main12 is commented out), so it
  	// is nil unless it is set somewhere else.
  	f *os.File

  	// m remembers, by title, which block titles have already been written.
  	m = map[string]bool{}
  )
  16. func main12() {
  17. //winnerorder()
  18. //return
  19. //log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
  20. //return
  21. //f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777)
  22. //all()
  23. one()
  24. }
  25. func all() {
  26. m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27082", "extract_kf")
  27. sess := m.Get()
  28. defer m.Close(sess)
  29. it := sess.DB("extract_kf").C("bidding201901").Find(nil).Iter()
  30. pool := make(chan bool, 5)
  31. count := 0
  32. for temp := make(map[string]interface{}); it.Next(&temp); {
  33. pool <- true
  34. count++
  35. go func(d map[string]interface{}) {
  36. defer func() {
  37. <-pool
  38. }()
  39. com(d)
  40. }(temp)
  41. temp = make(map[string]interface{})
  42. if count%200 == 0 {
  43. log.Println(count)
  44. }
  45. }
  46. log.Println("over...")
  47. time.Sleep(time.Hour)
  48. }
  49. func one() {
  50. m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
  51. d, _ := m.FindById("bidding", "5d424df7a5cb26b9b7b61fde", extract.Fields)
  52. com(*d)
  53. }
  54. func com(doc map[string]interface{}) {
  55. detail := GetDetail(doc)
  56. doc["detail"] = detail
  57. toptype := qu.ObjToString(doc["toptype"])
  58. subtype := qu.ObjToString(doc["subtype"])
  59. if qu.ObjToString(doc["type"]) == "bid" {
  60. toptype = "结果"
  61. }
  62. if toptype == "" {
  63. toptype = "*"
  64. }
  65. e := &extract.ExtractTask{
  66. TaskInfo: &extract.TaskInfo{
  67. Version: "V3.1.2",
  68. VersionId: "5cdd1c70e138234848c1d703",
  69. ProcessPool: make(chan bool, 1),
  70. },
  71. }
  72. e.Id = qu.ObjToString(ju.Config["udptaskid"])
  73. e.InitTaskInfo()
  74. //d.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  75. //d.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  76. e.InitRulePres()
  77. e.InitRuleBacks()
  78. e.InitRuleCore()
  79. e.InitBlockRule()
  80. e.InitTag()
  81. e.InitClearFn()
  82. if e.IsExtractCity { //版本上控制是否开始城市抽取
  83. //初始化城市DFA信息
  84. e.InitCityDFA()
  85. e.InitAreaCode()
  86. e.InitPostCode()
  87. }
  88. //质量审核
  89. e.InitAuditFields()
  90. e.InitAuditRule()
  91. e.InitAuditClass()
  92. e.InitAuditRecogField()
  93. //品牌抽取是否开启
  94. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  95. j := &ju.Job{
  96. SourceMid: qu.BsonIdToSId(doc["_id"]),
  97. Category: toptype,
  98. CategorySecond: subtype,
  99. Content: qu.ObjToString(doc["detail"]),
  100. SpiderCode: qu.ObjToString(doc["spidercode"]),
  101. //Domain: qu.ObjToString(doc["domain"]),
  102. //Href: qu.ObjToString(doc["href"]),
  103. Title: qu.ObjToString(doc["title"]),
  104. Data: &doc,
  105. City: qu.ObjToString(doc["city"]),
  106. Province: qu.ObjToString(doc["area"]),
  107. Result: map[string][]*ju.ExtField{},
  108. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  109. RuleBlock: e.RuleBlock,
  110. }
  111. e.TaskInfo.ProcessPool <- true
  112. pretreated.AnalyStart(j)
  113. e.ExtractProcess(j, nil)
  114. log.Println("=============块信息================")
  115. for _, v := range j.Block {
  116. log.Println("----", v.Title, v.Titles, "----")
  117. if v.ColonKV != nil {
  118. for kk, vv := range v.ColonKV.KvTags {
  119. for _, vvv := range vv {
  120. log.Println("ColonKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
  121. }
  122. }
  123. }
  124. if v.SpaceKV != nil {
  125. for kk, vv := range v.SpaceKV.KvTags {
  126. for _, vvv := range vv {
  127. log.Println("SpaceKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
  128. }
  129. }
  130. }
  131. if v.TableKV != nil {
  132. for kk, vv := range v.TableKV.KvTags {
  133. for _, vvv := range vv {
  134. log.Println("TableKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
  135. }
  136. }
  137. }
  138. //log.Println("Classify", v.Classify)
  139. //log.Println("Tag", v.Tag)
  140. }
  141. log.Println("=============抽取结果================")
  142. set := (e.ResultArr[0][1]["$set"]).(map[string]interface{})
  143. for k, v := range set {
  144. if k == "budget" || k == "bidamount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {
  145. log.Println(k, "---", v)
  146. }
  147. }
  148. log.Println("=============抽取结果 result================")
  149. for k, v := range set["result"].(map[string][]*ju.ExtField) {
  150. if k != "winner" {
  151. continue
  152. }
  153. for _, vv := range v {
  154. log.Println(k, fmt.Sprintf("%+v", vv))
  155. for kkk, vvv := range vv.ScoreItem {
  156. log.Println("--", kkk, k, fmt.Sprintf("%+v", vvv))
  157. }
  158. log.Println("\n")
  159. }
  160. }
  161. log.Println("=============中标候选人================")
  162. for _, v := range j.Winnerorder {
  163. log.Println(v)
  164. }
  165. log.Println("=============分包================")
  166. for k, v := range j.BlockPackage {
  167. log.Println(k, v)
  168. }
  169. log.Println("=============正文================")
  170. //log.Println(j.Content)
  171. return
  172. for _, v := range j.Block {
  173. if v.ColonKV != nil && v.ColonKV.KvTags != nil {
  174. for kk, vv := range v.ColonKV.KvTags {
  175. for _, vvv := range vv {
  176. log.Println(kk, vvv.Weight, vvv.Value)
  177. }
  178. }
  179. }
  180. if v.TableKV != nil && v.TableKV.KvTags != nil {
  181. for kk, vv := range v.TableKV.KvTags {
  182. for _, vvv := range vv {
  183. log.Println(kk, vvv.Weight, vvv.Value)
  184. }
  185. }
  186. }
  187. if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
  188. for kk, vv := range v.SpaceKV.KvTags {
  189. for _, vvv := range vv {
  190. log.Println(kk, vvv.Weight, vvv.Value)
  191. }
  192. }
  193. }
  194. }
  195. log.Println(len(j.Block))
  196. return
  197. for _, v := range j.Block {
  198. if m[v.Title] || v.Title == "" {
  199. continue
  200. }
  201. if !regexp.MustCompile("或|和|以?及|与|、|或").MatchString(v.Title) {
  202. //continue
  203. }
  204. m[v.Title] = true
  205. f.WriteString(j.SourceMid + "-----" + v.Title + "---" + fmt.Sprint(v.Titles) + "\n")
  206. continue
  207. for _, kv := range v.ColonKV.Kvs {
  208. log.Println("\n")
  209. log.Println(kv.Key, "---", kv.Value)
  210. log.Println(kv.Line)
  211. log.Println("=======================")
  212. }
  213. }
  214. }
  215. func GetDetail(doc map[string]interface{}) (detail string) {
  216. detail = ""
  217. d1, _ := doc["detail"].(string)
  218. d2, _ := doc["contenthtml"].(string)
  219. if len(d1) >= len(d2) || d2 == "" {
  220. detail = d1
  221. } else {
  222. detail = d2
  223. }
  224. detail = ju.CutLableStr(detail)
  225. detail = ju.NewCut().ClearHtml(detail)
  226. tabs, ration := pretreated.ComputeConRatio(detail, 1)
  227. if len(tabs) > 0 {
  228. newcon, newtabs, newration := pretreated.FindBigText(detail, ration, tabs)
  229. //log.Println(newcon, newtabs, newration)
  230. if newcon != "" && newration == 0 {
  231. detail = newcon
  232. tabs = newtabs
  233. ration = newration
  234. }
  235. }
  236. return detail
  237. }
  238. func winnerorder() {
  239. text := `评审专家名单:
  240. 吴殿波、韩屹、孙胜进、郑丹、李海波
  241. 中标标的名称、规格型号、数量、单价、服务要求:
  242. 2019年沈阳惠涌供热有限责任公司、沈阳圣达热力供暖有限责任公司、沈阳惠盛供热有限责任公司PE管保温
  243. 第一入围供货商:沈阳曲暖鼎盛保温安装有限公司 、总单价:11.833300
  244. 第二入围供货商:沈阳国盛防腐保温有限公司、总单价:11.102100
  245. 第三入围供货商:沈阳泰豪管材有限公司、总单价:13.258100`
  246. log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1))
  247. }