biddingindex.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. package main
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "log"
  6. mu "mfw/util"
  7. "net"
  8. qutil "qfw/util"
  9. elastic "qfw/util/elastic"
  10. "regexp"
  11. "strings"
  12. "sync"
  13. "time"
  14. "gopkg.in/mgo.v2/bson"
  15. )
  16. //对字段处理 bidamount budget
  17. //招标数据表和抽取表一一对应开始更新
  18. func biddingTask(data []byte, mapInfo map[string]interface{}) {
  19. defer qutil.Catch()
  20. q, _ := mapInfo["query"].(map[string]interface{})
  21. bkey, _ := mapInfo["bkey"].(string)
  22. if q == nil {
  23. q = map[string]interface{}{
  24. "_id": bson.M{
  25. "$gt": qutil.StringTOBsonId(mapInfo["gtid"].(string)),
  26. "$lte": qutil.StringTOBsonId(mapInfo["lteid"].(string)),
  27. },
  28. }
  29. }
  30. //连接信息
  31. c, _ := bidding["collect"].(string)
  32. extractc, _ := bidding["extractcollect"].(string)
  33. db, _ := bidding["db"].(string)
  34. extractdb, _ := bidding["extractdb"].(string)
  35. index, _ := bidding["index"].(string)
  36. itype, _ := bidding["type"].(string)
  37. //extract库
  38. extractsession := extractmgo.GetMgoConn(86400)
  39. defer extractmgo.DestoryMongoConn(extractsession)
  40. extractquery := extractsession.DB(extractdb).C(extractc).Find(q).Sort("_id").Iter()
  41. eMap := map[string]map[string]interface{}{}
  42. for tmp := make(map[string]interface{}); extractquery.Next(tmp); {
  43. tid := qutil.BsonIdToSId(tmp["_id"])
  44. eMap[tid] = tmp
  45. tmp = make(map[string]interface{})
  46. }
  47. //bidding库
  48. session := mgo.GetMgoConn(86400)
  49. count, _ := session.DB(db).C(c).Find(&q).Count()
  50. log.Println("查询语句:", q, "同步总数:", count, "elastic库:", index)
  51. n1, n2 := 0, 0
  52. if count < 200000 {
  53. res := make([]map[string]interface{}, 1)
  54. session.DB(db).C(c).Find(q).Select(bson.M{
  55. "projectinfo.attachment": 0,
  56. "contenthtml": 0,
  57. }).All(&res)
  58. mgo.DestoryMongoConn(session)
  59. if len(res) != count {
  60. log.Println("查询结果不一致", "count:", count, "res:", len(res))
  61. time.Sleep(20 * time.Second)
  62. toadd := &net.UDPAddr{
  63. IP: net.ParseIP("127.0.0.1"),
  64. Port: qutil.IntAll(Sysconfig["udpport"]),
  65. }
  66. udpclient.WriteUdp(data, mu.OP_TYPE_DATA, toadd)
  67. } else {
  68. n1, n2 = doIndex(res, eMap, index, itype, db, c, bkey)
  69. if (n1 + n2) != count {
  70. log.Println("任务错误,结果不一致")
  71. }
  72. }
  73. } else {
  74. log.Println("数据量太大,放弃!", count)
  75. mgo.DestoryMongoConn(session)
  76. }
  77. log.Println(mapInfo, "create bidding index...over", "all:", count, "n1:", n1, "n2:", n2)
  78. }
  79. func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interface{}, index, itype, db, c, bkey string) (int, int) {
  80. n1, n2 := 0, 0
  81. //线程池
  82. UpdatesLock := sync.Mutex{}
  83. fields := strings.Split(bidding["fields"].(string), ",")
  84. //更新数组
  85. arr := [][]map[string]interface{}{}
  86. arrEs := []map[string]interface{}{}
  87. //对比两张表数据,减少查询次数
  88. var compare bson.M
  89. log.Println("开始迭代..")
  90. for n, tmp := range infos {
  91. n1++
  92. update := map[string]interface{}{}
  93. //对比方法----------------
  94. tid := qutil.BsonIdToSId(tmp["_id"])
  95. if eMap[tid] != nil {
  96. compare = eMap[tid]
  97. delete(eMap, tid)
  98. //更新bidding表,生成索引
  99. for _, k := range fields {
  100. v1 := compare[k]
  101. v2 := tmp[k]
  102. if v2 == nil && v1 != nil {
  103. update[k] = v1
  104. } else if v2 != nil && v1 != nil {
  105. //update[k+"_b"] = v2
  106. update[k] = v1
  107. } else if v2 != nil && v1 == nil {
  108. //update[k+"_b"] = v2
  109. }
  110. }
  111. if qutil.IntAll(compare["repeat"]) == 1 {
  112. update["extracttype"] = -1
  113. } else {
  114. update["extracttype"] = 1
  115. }
  116. } else {
  117. compare = nil
  118. }
  119. //下面可以多线程跑的--->
  120. //处理分类
  121. if compare != nil {
  122. subscopeclass, _ := compare["subscopeclass"].([]interface{})
  123. if subscopeclass != nil {
  124. //str := ","
  125. m1 := map[string]bool{}
  126. newclass := []string{}
  127. for _, sc := range subscopeclass {
  128. sclass, _ := sc.(string)
  129. if !m1[sclass] {
  130. m1[sclass] = true
  131. //str += sclass + ","
  132. newclass = append(newclass, sclass)
  133. }
  134. }
  135. update["s_subscopeclass"] = strings.Join(newclass, ",")
  136. update["subscopeclass"] = newclass
  137. }
  138. //处理中标企业
  139. winner, _ := compare["winner"].(string)
  140. m1 := map[string]bool{}
  141. if winner != "" {
  142. m1[winner] = true
  143. }
  144. package1 := compare["package"]
  145. if package1 != nil {
  146. packageM, _ := package1.(map[string]interface{})
  147. for _, p := range packageM {
  148. pm, _ := p.(map[string]interface{})
  149. pw, _ := pm["winner"].(string)
  150. if pw != "" {
  151. m1[pw] = true
  152. }
  153. }
  154. }
  155. compare = nil
  156. if len(m1) > 0 {
  157. //str := ","
  158. winnerarr := []string{}
  159. for k, _ := range m1 {
  160. //str += k + ","
  161. winnerarr = append(winnerarr, k)
  162. }
  163. update["s_winner"] = strings.Join(winnerarr, ",")
  164. }
  165. }
  166. //------------------对比结束
  167. //处理key descript
  168. if bkey == "" {
  169. DealInfo(&tmp, &update)
  170. }
  171. //同时保存到elastic
  172. for tk, tv := range update {
  173. tmp[tk] = tv
  174. }
  175. //对projectscope字段的索引处理
  176. ps, _ := tmp["projectscope"].(string)
  177. if ps == "" {
  178. tmp["projectscope"] = "" //= tmp["detail"]
  179. }
  180. if len(ps) > ESLEN {
  181. tmp["projectscope"] = string(([]rune(ps))[:4000])
  182. }
  183. if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
  184. tmp["budget"] = nil
  185. } else if sbd, ok := tmp["budget"].(string); ok {
  186. tmp["budget"] = ObjToMoney([]interface{}{sbd, sbd})[0]
  187. }
  188. if s_bidamount := fmt.Sprint(tmp["bidamount"]); s_bidamount == "" || s_bidamount == "<nil>" || s_bidamount == "null" {
  189. tmp["bidamount"] = nil
  190. } else if sbd, ok := tmp["bidamount"].(string); ok {
  191. tmp["bidamount"] = ObjToMoney([]interface{}{sbd, sbd})[0]
  192. }
  193. UpdatesLock.Lock()
  194. // for k1, _ := range tmp {
  195. // if strings.HasSuffix(k1, "_b") || k1 == "contenthtml" {
  196. // delete(tmp, k1)
  197. // }
  198. // }
  199. go IS.Add("bidding")
  200. if qutil.IntAll(update["extracttype"]) != -1 {
  201. newTmp := map[string]interface{}{}
  202. for _, v := range biddingIndexFields {
  203. // if tmp[v] != nil {
  204. // newTmp[v] = tmp[v]
  205. // }
  206. if tmp[v] != nil {
  207. if "projectinfo" == v {
  208. mp, _ := tmp[v].(map[string]interface{})
  209. if mp != nil {
  210. newmap := map[string]interface{}{}
  211. for _, v1 := range projectinfoFields {
  212. if mp[v1] != nil {
  213. newmap[v1] = mp[v1]
  214. }
  215. }
  216. newTmp[v] = newmap
  217. attachments := mp["attachments"]
  218. con := ""
  219. if attachments != nil {
  220. am, _ := attachments.(map[string]interface{})
  221. if am != nil {
  222. for _, v1 := range am {
  223. vm, _ := v1.(map[string]interface{})
  224. if vm != nil {
  225. c, _ := vm["content"].(string)
  226. con += c
  227. }
  228. }
  229. }
  230. }
  231. if con != "" {
  232. con = FilterDetailSpace(con)
  233. newTmp["attachments"] = con
  234. }
  235. }
  236. } else {
  237. if v == "detail" {
  238. detail, _ := tmp[v].(string)
  239. newTmp[v] = FilterDetail(detail)
  240. } else {
  241. newTmp[v] = tmp[v]
  242. }
  243. }
  244. } else if v == "budget" || v == "bidamount" {
  245. newTmp[v] = nil
  246. }
  247. }
  248. arrEs = append(arrEs, newTmp)
  249. }
  250. if len(update) > 0 {
  251. arr = append(arr, []map[string]interface{}{
  252. map[string]interface{}{
  253. "_id": tmp["_id"],
  254. },
  255. map[string]interface{}{
  256. "$set": update,
  257. },
  258. })
  259. }
  260. if len(arr) >= BulkSize-1 {
  261. mgo.UpdateBulkAll(db, c, arr...)
  262. arr = [][]map[string]interface{}{}
  263. }
  264. if len(arrEs) >= BulkSize-1 {
  265. tmps := arrEs
  266. elastic.BulkSave(index, itype, &tmps, true)
  267. if len(multiIndex) == 2 {
  268. elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
  269. }
  270. arrEs = []map[string]interface{}{}
  271. }
  272. UpdatesLock.Unlock()
  273. if n%100 == 0 {
  274. log.Println("current:", n)
  275. }
  276. tmp = make(map[string]interface{})
  277. }
  278. UpdatesLock.Lock()
  279. if len(arr) > 0 {
  280. mgo.UpdateBulkAll(db, c, arr...)
  281. }
  282. if len(arrEs) > 0 {
  283. tmps := arrEs
  284. elastic.BulkSave(index, itype, &tmps, true)
  285. if len(multiIndex) == 2 {
  286. elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
  287. }
  288. }
  289. UpdatesLock.Unlock()
  290. return n1, n2
  291. }
  // client is the message-bus client created by inits and used by DealInfo.
  var client *mu.Client

  // reg matches a string made up only of ASCII letters, digits, '-' and '.'.
  var reg = regexp.MustCompile("^[0-9a-zA-Z-.]+$")

  // reg_space matches markup and whitespace noise: <style> elements, short
  // CSS-rule-like fragments ("name{...}"), any tag, literal "\t" sequences, tab
  // characters, runs of spaces and runs of rune 160 (non-breaking space).
  var reg_space = regexp.MustCompile("(?ism)(<style.*?>.*?</style>)|([.#]?\\w{1,20}\\{.*?\\})|(<.*?>)|(\\\\t)+|\\t|( +)|( +)|(" + string(rune(160)) + "+)")

  // reg_row matches an opening <tr>, <div> or <p> tag, or a run of newlines.
  var reg_row = regexp.MustCompile("(?i)<(tr|div|p)[^>]*?>|(\\n)+")

  // reg_dh matches a run of one or more commas.
  var reg_dh = regexp.MustCompile("[,]+")

  // reg_newdb matches a punctuation mark immediately followed by a comma; the
  // punctuation mark is captured as group 1.
  var reg_newdb = regexp.MustCompile("([:,、:,。.;])[,]")

  // reg_no matches a string that is empty or consists only of ASCII digits.
  var reg_no = regexp.MustCompile("^[0-9]*$")

  // MSG_SERVER is the message server address; inits replaces it with
  // Sysconfig["msg_server"] when that is set.
  var MSG_SERVER = "123.56.236.148:7070"

  // DesLen is the maximum length, in runes, of the description built by DealInfo.
  var DesLen = 120
  301. func inits() {
  302. ser := qutil.ObjToString(Sysconfig["msg_server"])
  303. if ser != "" {
  304. MSG_SERVER = ser
  305. }
  306. cf := &mu.ClientConfig{
  307. ClientName: "剑鱼抽关键词",
  308. EventHandler: func(p *mu.Packet) {},
  309. MsgServerAddr: MSG_SERVER,
  310. CanHandleEvents: []int{},
  311. OnConnectSuccess: func() {
  312. log.Println("c.")
  313. },
  314. ReadBufferSize: 10,
  315. WriteBufferSize: 10,
  316. }
  317. client, _ = mu.NewClient(cf)
  318. }
  //var clientlock = &sync.Mutex{}

  // keypool is a one-slot semaphore: DealInfo sends into it before calling the
  // message client and receives from it afterwards, so at most one such call is
  // in flight at a time.
  var keypool = make(chan bool, 1)
  321. func DealInfo(obj, update *map[string]interface{}) {
  322. defer qutil.Catch()
  323. if (*obj)["keywords"] != nil && (*obj)["description"] != nil {
  324. return
  325. } else {
  326. (*update)["keywords"] = ""
  327. (*update)["description"] = ""
  328. }
  329. title := qutil.ObjToString((*obj)["title"])
  330. var m [][]string
  331. select {
  332. case <-func() <-chan bool {
  333. ch := make(chan bool, 1)
  334. go func(chan bool) {
  335. select {
  336. case keypool <- true:
  337. defer func() {
  338. <-keypool
  339. }()
  340. ret, _ := client.Call("", mu.UUID(8), 4010, mu.SENDTO_TYPE_RAND_RECIVER, title, 1)
  341. json.Unmarshal(ret, &m)
  342. case <-time.After(10 * time.Millisecond):
  343. }
  344. ch <- true
  345. }(ch)
  346. return ch
  347. }():
  348. case <-time.After(40 * time.Millisecond):
  349. }
  350. arr := []string{}
  351. keyword := []string{}
  352. keywordnew := []string{}
  353. for _, tmp := range m {
  354. if reg.MatchString(tmp[0]) {
  355. arr = append(arr, tmp[0])
  356. } else {
  357. if len(arr) > 0 {
  358. str := strings.Join(arr, "")
  359. keyword = append(keyword, str)
  360. arr = []string{}
  361. }
  362. if len(tmp[0]) > 3 && (strings.HasPrefix(tmp[1], "n") || tmp[1] == "v" || tmp[1] == "vn" || strings.HasPrefix(tmp[1], "g")) {
  363. keyword = append(keyword, tmp[0])
  364. }
  365. }
  366. }
  367. for _, v := range keyword {
  368. v = reg_no.ReplaceAllString(v, "")
  369. if len(v) > 0 {
  370. keywordnew = append(keywordnew, v)
  371. }
  372. }
  373. keywords := strings.Join(keywordnew, ",")
  374. (*update)["keywords"] = keywords
  375. content := ""
  376. if (*obj)["detail_bak"] != nil {
  377. content = qutil.ObjToString((*obj)["detail_bak"])
  378. } else {
  379. content = qutil.ObjToString((*obj)["detail"])
  380. }
  381. //内容替换
  382. content = strings.Replace(content, " ", "", -1)
  383. content = reg_space.ReplaceAllString(content, "")
  384. content = reg_row.ReplaceAllString(content, ",")
  385. content = reg_dh.ReplaceAllString(content, ",")
  386. content = reg_newdb.ReplaceAllString(content, "$1")
  387. if strings.HasPrefix(content, ",") {
  388. content = content[1:]
  389. }
  390. //log.Println(content)
  391. tc := []rune(content)
  392. ltc := len(tc)
  393. description := content
  394. if ltc > DesLen {
  395. description = string(tc[:DesLen])
  396. }
  397. (*update)["description"] = description
  398. //保存到数据库
  399. return
  400. }