// extractudp.go
  1. // extractudp
  2. package extract
  3. import (
  4. "encoding/json"
  5. "fmt"
  6. db "jy/mongodbutil"
  7. ju "jy/util"
  8. mu "mfw/util"
  9. "net"
  10. qu "qfw/util"
  11. "sync"
  12. log "github.com/donnie4w/go-logger/logger"
  13. "gopkg.in/mgo.v2/bson"
  14. )
  15. var Udpclient mu.UdpClient //udp对象
  16. var nextNodes []map[string]interface{}
  17. //udp通知抽取
  18. func ExtractUdp() {
  19. nextNodes = qu.ObjArrToMapArr(ju.Config["nextNode"].([]interface{}))
  20. Udpclient = mu.UdpClient{Local: ":" + qu.ObjToString(ju.Config["udpport"]), BufSize: 1024}
  21. Udpclient.Listen(processUdpMsg)
  22. }
  23. func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
  24. switch act {
  25. case mu.OP_TYPE_DATA:
  26. var rep map[string]interface{}
  27. err := json.Unmarshal(data, &rep)
  28. if err != nil {
  29. log.Debug(err)
  30. } else {
  31. sid, _ := rep["gtid"].(string)
  32. eid, _ := rep["lteid"].(string)
  33. stype, _ := rep["stype"].(string)
  34. if sid == "" || eid == "" {
  35. log.Debug("err", "sid=", sid, "eid=", eid)
  36. } else {
  37. go Udpclient.WriteUdp([]byte("udpok"), mu.OP_NOOP, ra)
  38. if stype == "distributed" { //分布式抽取分支
  39. log.Debug("分布式抽取id段", sid, eid)
  40. InstanceId := qu.ObjToString(rep["InstanceId"])
  41. db.Mgo.Update("ecs", `{"InstanceId":"`+InstanceId+`"}`,
  42. map[string]interface{}{
  43. "$set": map[string]interface{}{
  44. "extstatus": "running",
  45. },
  46. }, true, false)
  47. ExtractByUdp(sid, eid, qu.ObjToString(rep["InstanceId"]))
  48. db.Mgo.Update("ecs", `{"InstanceId":"`+InstanceId+`"}`,
  49. map[string]interface{}{
  50. "$set": map[string]interface{}{
  51. "extstatus": "ok",
  52. },
  53. }, true, false)
  54. log.Debug("分布式抽取完成", sid, eid, "释放esc实例", qu.ObjToString(rep["ip"]))
  55. } else {
  56. log.Debug("udp通知抽取id段", sid, eid)
  57. ExtractByUdp(sid, eid)
  58. log.Debug("udp通知抽取完成,eid=", eid)
  59. for _, m := range nextNodes {
  60. by, _ := json.Marshal(map[string]interface{}{
  61. "gtid": sid,
  62. "lteid": eid,
  63. "stype": qu.ObjToString(m["stype"]),
  64. })
  65. err := Udpclient.WriteUdp(by, mu.OP_TYPE_DATA, &net.UDPAddr{
  66. IP: net.ParseIP(m["addr"].(string)),
  67. Port: qu.IntAll(m["port"]),
  68. })
  69. if err != nil {
  70. log.Debug(err)
  71. }
  72. }
  73. }
  74. }
  75. }
  76. case mu.OP_NOOP: //下个节点回应
  77. var rep map[string]interface{}
  78. err := json.Unmarshal(data, &rep)
  79. if err != nil {
  80. log.Debug(err)
  81. } else {
  82. log.Debug(rep)
  83. }
  84. }
  85. }
  86. var ext *ExtractTask
  87. //根据id区间抽取
  88. func ExtractByUdp(sid, eid string, instanceId ...string) {
  89. defer qu.Catch()
  90. if ext == nil {
  91. ext = &ExtractTask{}
  92. ext.Id = qu.ObjToString(ju.Config["udptaskid"])
  93. ext.InitTaskInfo()
  94. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  95. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  96. ext.InitRulePres()
  97. ext.InitRuleBacks()
  98. ext.InitRuleCore()
  99. ext.InitTag()
  100. ext.InitClearFn()
  101. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  102. //初始化城市DFA信息
  103. ext.InitDFA()
  104. }
  105. //质量审核
  106. ext.InitAuditFields()
  107. ext.InitAuditRule()
  108. ext.InitAuditClass()
  109. ext.InitAuditRecogField()
  110. //品牌抽取是否开启
  111. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  112. ext.ResultSave(true)
  113. ext.BidSave(true)
  114. ext.IsRun = true
  115. }
  116. if len(instanceId) > 0 { //分布式抽取进度
  117. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
  118. count1 := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  119. count2 := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl+"_back", query)
  120. count := count1 + count2
  121. pageNum := (count + PageSize - 1) / PageSize
  122. limit := PageSize
  123. if count < PageSize {
  124. limit = count
  125. }
  126. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  127. startI := 0 //接着上次任务执行
  128. sidback := sid
  129. esc, _ := db.Mgo.FindOne("ecs", `{"InstanceId":"`+instanceId[0]+`"}`)
  130. startI = qu.IntAll((*esc)["pagecurrent"])
  131. if qu.ObjToString((*esc)["lastId"]) != "" {
  132. sid = qu.ObjToString((*esc)["lastId"])
  133. }
  134. if qu.ObjToString((*esc)["lastIdback"]) != "" {
  135. sidback = qu.ObjToString((*esc)["lastIdback"])
  136. }
  137. for i := startI; i < pageNum; i++ {
  138. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
  139. fmt.Printf("page=%d,query=%v", i+1, query)
  140. if ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query) > 0 {
  141. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  142. for _, v := range *list {
  143. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  144. continue
  145. }
  146. _id := qu.BsonIdToSId(v["_id"])
  147. log.Debug(_id)
  148. var j, jf *ju.Job
  149. if ext.IsFileField && v["projectinfo"] != nil {
  150. v["isextFile"] = true
  151. j, jf = PreInfo(v)
  152. } else {
  153. j, _ = PreInfo(v)
  154. }
  155. ext.TaskInfo.ProcessPool <- true
  156. go ext.ExtractProcess(j, jf)
  157. sid = _id
  158. }
  159. db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
  160. map[string]interface{}{"$set": map[string]interface{}{
  161. "lastId": sid,
  162. }}, true, false)
  163. }
  164. queryback := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sidback)}}
  165. fmt.Printf("page=%d,queryback=%v", i+1, queryback)
  166. if ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl+"_back", queryback) > 0 {
  167. list2, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl+"_back", queryback, nil, Fields, false, 0, limit)
  168. for _, v := range *list2 {
  169. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  170. continue
  171. }
  172. _id := qu.BsonIdToSId(v["_id"])
  173. log.Debug(_id)
  174. var j, jf *ju.Job
  175. if ext.IsFileField && v["projectinfo"] != nil {
  176. v["isextFile"] = true
  177. j, jf = PreInfo(v)
  178. } else {
  179. j, _ = PreInfo(v)
  180. }
  181. ext.TaskInfo.ProcessPool <- true
  182. go ext.ExtractProcess(j, jf)
  183. sidback = _id
  184. }
  185. db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
  186. map[string]interface{}{"$set": map[string]interface{}{
  187. "lastIdback": sidback,
  188. }}, true, false)
  189. }
  190. db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
  191. map[string]interface{}{"$set": map[string]interface{}{
  192. "pagetotal": pageNum,
  193. "pagecurrent": i + 1,
  194. }}, true, false)
  195. }
  196. } else { //普通抽取
  197. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
  198. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  199. pageNum := (count + PageSize - 1) / PageSize
  200. limit := PageSize
  201. if count < PageSize {
  202. limit = count
  203. }
  204. wg := sync.WaitGroup{}
  205. for i := 0; i < pageNum; i++ {
  206. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid)}}
  207. fmt.Printf("page=%d,query=%v", i+1, query)
  208. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  209. for k, v := range *list {
  210. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  211. continue
  212. }
  213. _id := qu.BsonIdToSId(v["_id"])
  214. var j, jf *ju.Job
  215. if ext.IsFileField && v["projectinfo"] != nil {
  216. v["isextFile"] = true
  217. j, jf = PreInfo(v)
  218. } else {
  219. j, _ = PreInfo(v)
  220. }
  221. ext.TaskInfo.ProcessPool <- true
  222. wg.Add(1)
  223. go func() {
  224. defer wg.Done()
  225. ext.ExtractProcess(j, jf)
  226. }()
  227. if k%1000 == 0 {
  228. log.Debug(i, k, _id)
  229. }
  230. sid = _id
  231. }
  232. }
  233. wg.Wait()
  234. ext.BidSave(false)
  235. }
  236. }