task.go 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. package main
  2. import (
  3. "encoding/json"
  4. "log"
  5. "qfw/util"
  6. "regexp"
  7. //"strings"
  8. "time"
  9. )
  10. const (
  11. InitMinTime = int64(1325347200) //最小时间位置2012
  12. )
  13. //全量合并
  14. func taskQl(udpInfo map[string]interface{}) {
  15. defer util.Catch()
  16. //1、检查pubilshtime索引
  17. db, _ := udpInfo["db"].(string)
  18. if db == "" {
  19. db = MongoTool.DbName
  20. }
  21. coll, _ := udpInfo["coll"].(string)
  22. if coll == "" {
  23. coll = ExtractColl
  24. }
  25. sess := MongoTool.GetMgoConn()
  26. bcon := false
  27. if sess.DB(db).C(coll).EnsureIndexKey("publishtime_1", "publishtime_-1") == nil {
  28. bcon = true
  29. } else {
  30. log.Println("publishtime_1索引不存在")
  31. }
  32. MongoTool.DestoryMongoConn(sess)
  33. thread := util.IntAllDef(udpInfo["thread"], 1)
  34. if bcon {
  35. //获取起始时间
  36. startTime, END := int64(0), int64(0)
  37. sts, bres := MongoTool.Find(ExtractColl, `{}`, "publishtime", `{"publishtime":1}`, true, 0, 1)
  38. if bres && sts != nil && len(*sts) == 1 {
  39. startTime = util.Int64All((*sts)[0]["publishtime"])
  40. sts, bres = MongoTool.Find(ExtractColl, `{}`, "-publishtime", `{"publishtime":1}`, true, 0, 1)
  41. if bres && sts != nil && len(*sts) == 1 {
  42. END = util.Int64All((*sts)[0]["publishtime"])
  43. }
  44. log.Println("查询到的起始时间", startTime, END)
  45. } else {
  46. return
  47. }
  48. startTime -= 1
  49. sum := 0
  50. if startTime < InitMinTime {
  51. q := map[string]interface{}{
  52. "publishtime": map[string]interface{}{
  53. "$gt": startTime,
  54. "$lte": InitMinTime,
  55. },
  56. }
  57. sum = Mql(q, thread, db, coll, sum)
  58. startTime = InitMinTime
  59. }
  60. for {
  61. if startTime >= END {
  62. break
  63. }
  64. et := startTime + 50*86400
  65. if et >= END {
  66. et = END
  67. }
  68. q := map[string]interface{}{
  69. "publishtime": map[string]interface{}{
  70. "$gt": startTime,
  71. "$lte": et,
  72. },
  73. }
  74. sum = Mql(q, thread, db, coll, sum)
  75. startTime = et
  76. time.Sleep(1 * time.Second)
  77. }
  78. }
  79. log.Println("task over!!!")
  80. }
  81. func Mql(q map[string]interface{}, thread int, db, coll string, sum int) int {
  82. defer util.Catch()
  83. sess := MongoTool.GetMgoConn()
  84. defer MongoTool.DestoryMongoConn(sess)
  85. query := sess.DB(db).C(coll).Find(q).Sort("publishtime").Iter()
  86. pool := make(chan bool, thread)
  87. count := 0
  88. for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
  89. info := ParseInfo(tmp)
  90. if info != nil && !((info.pnbval == 1 && info.Buyer != "") || info.pnbval == 0) {
  91. pool <- true
  92. go func(info *Info, tmp map[string]interface{}) {
  93. defer func() {
  94. <-pool
  95. }()
  96. startProjectMerge(info, tmp)
  97. }(info, tmp)
  98. } else {
  99. //log.Println("info err:", tmp["_id"], tmp["title"], tmp["buyer"])
  100. }
  101. if sum%1000 == 0 {
  102. log.Println("current", sum)
  103. }
  104. sum++
  105. tmp = make(map[string]interface{})
  106. }
  107. //阻塞
  108. for n := 0; n < thread; n++ {
  109. pool <- true
  110. }
  111. //完成
  112. log.Println("sontask over:", count, sum, q)
  113. return sum
  114. }
  115. var (
  116. titleGetPc = regexp.MustCompile("^([-0-9a-zA-Z第号采招政询电审竞#]{8,}[-0-9a-zA-Z#]+)")
  117. titleGetPc1 = regexp.MustCompile("[\\[【((](.{0,6}(编号|编码|项号|包号|代码|标段?号)[::为])?([-0-9a-zA-Z第号采招政询电审竞#]{5,}([\\[\\]()()][-0-9a-zA-Z第号采招审竞#]+[\\[\\]()()][-0-9a-zA-Z第号采招审竞#]+)?)[\\]】))]")
  118. titleGetPc2 = regexp.MustCompile("([-0-9a-zA-Z第号采政招询电审竞#]{8,}[-0-9a-zA-Z#]+)(.{0,5}公告)?$")
  119. pcReplace = regexp.MustCompile("([\\[【((〖〔《{﹝{](重|第?[二三四再]次.{0,4})[\\]】))〗〕》}﹞}])$|[\\[\\]【】()()〖〗〔〕《》{}﹝﹞-{}–  ]+|(号|重|第?[二三四五再]次(招标)?)$")
  120. )
  121. func ParseInfo(tmp map[string]interface{}) (info *Info) {
  122. bys, _ := json.Marshal(tmp)
  123. var thisinfo *Info
  124. json.Unmarshal(bys, &thisinfo)
  125. if thisinfo == nil {
  126. return nil
  127. }
  128. if len(thisinfo.Topscopeclass) == 0 {
  129. thisinfo.Topscopeclass = []string{}
  130. }
  131. if len(thisinfo.Subscopeclass) == 0 {
  132. thisinfo.Subscopeclass = []string{}
  133. }
  134. res := titleGetPc.FindStringSubmatch(thisinfo.Title)
  135. if len(res) > 1 && len(res[1]) > 6 && thisinfo.ProjectCode != res[1] && !numCheckPc.MatchString(res[1]) && !_zimureg1.MatchString(res[1]) {
  136. thisinfo.PTC = res[1]
  137. thisinfo.pnbval++
  138. } else {
  139. res = titleGetPc1.FindStringSubmatch(thisinfo.Title)
  140. if len(res) > 3 && len(res[3]) > 6 && thisinfo.ProjectCode != res[3] && !numCheckPc.MatchString(res[3]) && !_zimureg1.MatchString(res[3]) {
  141. thisinfo.PTC = res[3]
  142. thisinfo.pnbval++
  143. } else {
  144. res = titleGetPc2.FindStringSubmatch(thisinfo.Title)
  145. if len(res) > 1 && len(res[1]) > 6 && thisinfo.ProjectCode != res[1] && !numCheckPc.MatchString(res[1]) && !_zimureg1.MatchString(res[1]) {
  146. thisinfo.PTC = res[1]
  147. thisinfo.pnbval++
  148. }
  149. }
  150. }
  151. if thisinfo.ProjectName != "" && len([]rune(thisinfo.ProjectName)) > 0 {
  152. // thisinfo.ProjectName = strings.Replace(thisinfo.ProjectName, "(", "(", -1)
  153. // thisinfo.ProjectName = strings.Replace(thisinfo.ProjectName, ")", ")", -1)
  154. // thisinfo.ProjectName = strings.Replace(thisinfo.ProjectName, "-", "", -1)
  155. thisinfo.ProjectName = pcReplace.ReplaceAllString(thisinfo.ProjectName, "")
  156. if thisinfo.ProjectName != "" {
  157. thisinfo.pnbval++
  158. }
  159. }
  160. if thisinfo.ProjectCode != "" || thisinfo.PTC != "" {
  161. if thisinfo.ProjectCode != "" {
  162. thisinfo.ProjectCode = pcReplace.ReplaceAllString(thisinfo.ProjectCode, "")
  163. // thisinfo.ProjectCode = strings.Replace(thisinfo.ProjectCode, "(", "(", -1)
  164. // thisinfo.ProjectCode = strings.Replace(thisinfo.ProjectCode, ")", ")", -1)
  165. // thisinfo.ProjectCode = strings.Replace(thisinfo.ProjectCode, "-", "", -1)
  166. } else {
  167. thisinfo.PTC = pcReplace.ReplaceAllString(thisinfo.PTC, "")
  168. // thisinfo.PTC = strings.Replace(thisinfo.PTC, "(", "(", -1)
  169. // thisinfo.PTC = strings.Replace(thisinfo.PTC, ")", ")", -1)
  170. // thisinfo.PTC = strings.Replace(thisinfo.PTC, "-", "", -1)
  171. }
  172. if thisinfo.ProjectCode != "" || thisinfo.PTC != "" {
  173. thisinfo.pnbval++
  174. }
  175. }
  176. if thisinfo.ProjectCode == thisinfo.PTC {
  177. thisinfo.PTC = ""
  178. }
  179. if thisinfo.Buyer != "" && len([]rune(thisinfo.Buyer)) > 2 {
  180. thisinfo.pnbval++
  181. } else {
  182. thisinfo.Buyer = ""
  183. }
  184. thisinfo.LenPC = len([]rune(thisinfo.ProjectCode))
  185. thisinfo.LenPTC = len([]rune(thisinfo.PTC))
  186. thisinfo.LenPN = len([]rune(thisinfo.ProjectName))
  187. return thisinfo
  188. }