util.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. package main
  2. import (
  3. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  4. "regexp"
  5. "strconv"
  6. "strings"
  7. )
  8. type TagMatching struct {
  9. tagName string //标签名称
  10. tagCode string //标签值(保存)
  11. matchField []string //关键词匹配字段,title,detail
  12. matchKey string //匹配的词语,多个使用逗号连接,"部队,国防,军事,军用"
  13. matchKeyReg []*RegexpInfo //关键词的正则
  14. matchOutKey string //匹配关键词的同时,排除关键词中的特殊词
  15. matchOutReg []*RegexpInfo //关键词中排除词的正则
  16. addField []string //附加词匹配字段
  17. addKey string //附件词匹配关键词
  18. addKeyReg []*RegexpInfo //附加次的正则
  19. addOutKey string //满足附加词同时,需要排除的关键词,排除字段还是 addKey
  20. addOutReg []*RegexpInfo //满足附加词中排除词的正则表达式
  21. excludeField []string //全局排除词
  22. excludeKey string //全局排除词匹配词
  23. excludeKeyReg []*RegexpInfo
  24. clearKey []string //清理词匹配字段跟关键词一样
  25. buyerclass string //采购单位类型字段
  26. }
  // RegexpInfo is one keyword prepared for matching.
  //
  // keyStr is the keyword exactly as written in the rule. regs is its compiled
  // pattern; it is nil for compound keywords (those containing "&&" or "&!"),
  // which are evaluated from keyStr instead.
  type RegexpInfo struct {
  	keyStr string
  	regs   *regexp.Regexp
  }

  // GetRegex splits key on "," and returns one RegexpInfo per keyword, in order.
  //
  //   - Empty segments (an empty key, "a,,b", a trailing comma) are skipped.
  //     Compiling them would yield a pattern that matches every text.
  //   - Keywords containing "&&" or "&!" are stored with a nil regs.
  //   - Every other keyword is compiled as a case-insensitive pattern. A keyword
  //     that is not a valid regular expression (for example "c++") is matched
  //     literally instead of panicking.
  //
  // The result is nil when key holds no non-empty keyword.
  func GetRegex(key string) []*RegexpInfo {
  	var infos []*RegexpInfo
  	for _, s := range strings.Split(key, ",") {
  		if s == "" {
  			continue
  		}
  		info := &RegexpInfo{keyStr: s}
  		if !strings.Contains(s, "&&") && !strings.Contains(s, "&!") {
  			re, err := regexp.Compile(".*(?i)" + s + ".*")
  			if err != nil {
  				// QuoteMeta output always compiles, so MustCompile cannot panic here.
  				re = regexp.MustCompile(".*(?i)" + regexp.QuoteMeta(s) + ".*")
  			}
  			info.regs = re
  		}
  		infos = append(infos, info)
  	}
  	return infos
  }
  51. // TaskTags 根据数据和正则规则,验证数据标签
  52. func TaskTags(tmp map[string]interface{}, regs []TagMatching) (tags []string) {
  53. for _, v := range regs {
  54. // 1.排除词
  55. if len(v.excludeField) > 0 && len(v.excludeKeyReg) > 0 {
  56. // 遍历排除词对应的tmp字段信息
  57. for _, f := range v.excludeField {
  58. if val := util.ObjToString(tmp[f]); val != "" {
  59. if getRegsResult(val, v.excludeKeyReg) {
  60. return
  61. }
  62. }
  63. }
  64. }
  65. // 清理词;目的把 类似 fuck的单词替换为空字符串
  66. if len(v.clearKey) > 0 && len(v.matchField) > 0 {
  67. for _, s := range v.clearKey {
  68. for _, f := range v.matchField {
  69. if val := util.ObjToString(tmp[f]); val != "" {
  70. tmp[f] = strings.ReplaceAll(val, s, "")
  71. }
  72. }
  73. }
  74. }
  75. // 关键词
  76. if len(v.matchField) > 0 && len(v.matchKeyReg) > 0 {
  77. for _, f := range v.matchField {
  78. if val := util.ObjToString(tmp[f]); val != "" {
  79. //判断关键词字段的排除情况,含关键词的排除词直接退出
  80. if len(v.matchOutReg) > 0 && getRegsResult(val, v.matchOutReg) {
  81. return
  82. }
  83. // 符合关键词条件
  84. if getRegsResult(val, v.matchKeyReg) {
  85. // 附加词含有排除词时
  86. if len(v.addOutReg) > 0 && getRegsResult(val, v.addOutReg) {
  87. return
  88. }
  89. if len(v.addField) > 0 && len(v.addKeyReg) > 0 {
  90. // 不满足附加词,直接返回
  91. if !getRegsResult(val, v.addKeyReg) {
  92. continue
  93. }
  94. }
  95. tags = append(tags, v.tagName)
  96. }
  97. }
  98. }
  99. }
  100. }
  101. return
  102. }
  103. // getRegsResult 验证数据是否符合数组正则
  104. func getRegsResult(data string, regs []*RegexpInfo) (res bool) {
  105. for _, e1 := range regs {
  106. if e1.regs != nil && e1.regs.MatchString(data) {
  107. return true
  108. } else {
  109. // && 特殊处理
  110. if strings.Contains(e1.keyStr, "&&") {
  111. flag := true
  112. for _, s := range strings.Split(e1.keyStr, "&&") {
  113. if !strings.Contains(data, s) {
  114. flag = false
  115. break
  116. }
  117. }
  118. if flag {
  119. return true
  120. }
  121. }
  122. // 前面是必须有的关键词&!,后面是不能有的关键词;比如 军队&!点军队,
  123. if strings.Contains(e1.keyStr, "&!") {
  124. keys := strings.Split(e1.keyStr, "&!")
  125. if strings.Contains(data, keys[0]) && !strings.Contains(data, keys[1]) {
  126. return true
  127. }
  128. }
  129. }
  130. }
  131. return false
  132. }
  // Patterns and lookup tables used by the money parsers.
  //
  // The character classes list their members directly. Inside [...] a "|" is a
  // literal pipe, not an alternation, so writing it there would make pipe runs
  // such as table borders count as capital numerals or units.
  // The patterns are constants, so MustCompile is used rather than discarding
  // the error from Compile.
  var (
  	// regNumFloat matches an unsigned decimal number such as 0, 12 or 12.50.
  	regNumFloat = regexp.MustCompile(`([1-9]\d*|0)(\.\d+)?`)
  	// regStrUnit matches one unit character.
  	regStrUnit = regexp.MustCompile(`[元万亿]`)
  	// contentUnit matches a document-wide "amounts are in 万" marker.
  	contentUnit = regexp.MustCompile(`(万元|单位/万)`)
  	// numCapitals matches a run of 4 to 40 capital-numeral characters.
  	numCapitals = regexp.MustCompile(`([〇零点壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正]{4,40})`)
  	// regStrChar is the single-character class behind moneyRegChar.
  	regStrChar   = `[〇零点壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正]`
  	moneyRegChar = regexp.MustCompile(regStrChar)
  	// regQianw matches one or two digits followed by 千万.
  	regQianw = regexp.MustCompile(`\d{1,2}千万`)
  	// cutAllSpace matches runs of whitespace.
  	cutAllSpace = regexp.MustCompile(`\s*`)
  	// spaces lists blank characters removed as literal strings by CutAllSpace.
  	spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
  	// moneyChar maps a numeral character to its value. Every value is a
  	// float64 except "点", which maps to the string ".".
  	moneyChar = map[string]interface{}{ //"〇": "0", "零": "0",
  		"一": float64(1), "壹": float64(1), "二": float64(2), "贰": float64(2), "三": float64(3), "叁": float64(3), "四": float64(4), "肆": float64(4), "五": float64(5), "伍": float64(5),
  		"六": float64(6), "陆": float64(6), "七": float64(7), "柒": float64(7), "八": float64(8), "捌": float64(8), "九": float64(9), "玖": float64(9), "十": float64(10), "拾": float64(10),
  		"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
  		"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
  	}
  	// moneyUnit maps a unit character to its multiplier.
  	moneyUnit = map[string]float64{
  		"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
  	}
  )
  // currencyItem maps a currency marker that may appear in text to the
  // canonical currency name.
  var currencyItem = map[string]string{
  	"人民币": "人民币",
  	"rmb": "人民币",
  	"RMB": "人民币",
  	"$":   "美元",
  	// Full-width dollar sign, written as an escape so it cannot be folded
  	// into the ASCII "$" above (a duplicate key does not compile).
  	"\uFF04": "美元",
  	"美元":     "美元",
  	"港元":     "港币",
  	"港币":     "港币",
  	"澳币":     "澳币",
  	"澳元":     "澳币",
  }

  // GetCurrency returns the currency named in text.
  //
  // It returns "" for an empty text and "人民币" when no marker from
  // currencyItem occurs. When several markers occur, the one that appears
  // first in text wins, so the result does not depend on map iteration order.
  func GetCurrency(text string) (currency string) {
  	if text == "" {
  		return ""
  	}
  	currency = "人民币"
  	firstPos, firstKey := -1, ""
  	for marker, name := range currencyItem {
  		pos := strings.Index(text, marker)
  		if pos < 0 {
  			continue
  		}
  		// Earliest position wins; the key comparison only makes a tie
  		// between two markers at the same position deterministic.
  		if firstPos < 0 || pos < firstPos || (pos == firstPos && marker < firstKey) {
  			firstPos, firstKey, currency = pos, marker, name
  		}
  	}
  	return currency
  }
  179. // 金额转换
  180. func ObjToMoney(text string) float64 {
  181. isfindUnit := true
  182. ret := capitalMoney(text)
  183. if ret < float64(10000) || ret > float64(50000000000) {
  184. ret2, b := numMoney(text)
  185. isfindUnit = b
  186. if ret2 > ret {
  187. ret = ret2
  188. }
  189. }
  190. f, _ := strconv.ParseFloat(strconv.FormatFloat(ret, 'f', 4, 64), 64)
  191. // if f < 1 {
  192. // f = 0
  193. // }
  194. //如果金额小于50,全文检索单位:万
  195. if f < 50 && f > 0 && isfindUnit {
  196. rep := contentUnit.FindAllStringIndex(text, -1)
  197. if len(rep) > 0 {
  198. f = f * 10000
  199. }
  200. }
  201. return f
  202. }
  203. func capitalMoney(text string) float64 {
  204. nodes := []float64{}
  205. node := float64(0)
  206. tmp := float64(0)
  207. decimals := 0.0
  208. ishaspoint := false //是否含小数点
  209. fnum := float64(0)
  210. end := false
  211. //str := fmt.Sprint(data[0])
  212. //提取第一个大写信息
  213. strmatch := numCapitals.FindAllStringSubmatch(text, -1)
  214. if len(strmatch) > 0 {
  215. text = strmatch[0][0]
  216. }
  217. suffixUnit := float64(1)
  218. if strings.HasSuffix(text, "万") || strings.HasSuffix(text, "万元") || strings.HasSuffix(text, "万元整") {
  219. index := strings.LastIndex(text, "万")
  220. text = text[0:index]
  221. suffixUnit = float64(10000)
  222. }
  223. moneyRegChar.ReplaceAllStringFunc(text, func(key string) string {
  224. if key == "元" || key == "圆" || key == "点" {
  225. ishaspoint = true
  226. }
  227. if v, ok := moneyChar[key].(float64); ok && !end {
  228. if ishaspoint && v > 10 { //排除后面有其他的单位
  229. return ""
  230. }
  231. //fmt.Println(key, v, fnum)
  232. if v < 10 && v >= 0 {
  233. if ishaspoint { //小数部分
  234. if v >= 1 {
  235. fnum = v
  236. } else if v < 1 && v > 0 {
  237. decimals += fnum * v
  238. }
  239. } else {
  240. if tmp != float64(0) {
  241. node += tmp
  242. }
  243. tmp = float64(v)
  244. }
  245. } else if v == 10000 || v == 100000000 { //单位万、亿
  246. if tmp != float64(0) {
  247. node += tmp
  248. tmp = float64(0)
  249. }
  250. nodes = append(nodes, node*float64(v))
  251. node = float64(0)
  252. } else {
  253. if v == 10 && tmp == 0 {
  254. tmp = 1
  255. }
  256. tmp = tmp * float64(v)
  257. node += tmp
  258. tmp = float64(0)
  259. }
  260. }
  261. if key == "整" || key == "正" || key == "分" {
  262. end = true
  263. }
  264. return ""
  265. })
  266. nodes = append(nodes, node, tmp)
  267. ret := float64(0)
  268. for _, v := range nodes {
  269. ret += v
  270. }
  271. return (ret + decimals) * suffixUnit
  272. }
  273. // 数字金额转换
  274. func numMoney(text string) (moneyFloat float64, flag bool) {
  275. //tmp := fmt.Sprintf("%f", data[0])
  276. repUnit := float64(1)
  277. if regQianw.MatchString(text) {
  278. text = strings.Replace(text, "千万", "万", -1)
  279. repUnit = float64(1000)
  280. }
  281. text = replaceSymbol(text, []string{",", ",", "(", ")", "(", ")", ":", "\n"})
  282. text = replaceString(text, []string{"万元", "亿元", "."}, []string{"万", "亿", "."})
  283. text = CutAllSpace(text)
  284. rets := regNumFloat.FindAllString(text, -1)
  285. fnums := []float64{}
  286. unitstrs := []string{}
  287. if len(rets) > 0 {
  288. pindex := 0 //单位前置
  289. for k, v := range rets {
  290. f, err := strconv.ParseFloat(v, 64)
  291. if err == nil {
  292. fnums = append(fnums, f)
  293. index := strings.Index(text, v)
  294. //单位后置
  295. start := index + len(v)
  296. end := start + 3
  297. //log.Println("vvv", tmp, v, pindex, index, start)
  298. if k > 0 {
  299. if start >= pindex+3 {
  300. pstart := pindex + 3
  301. if pstart >= index {
  302. pstart = index
  303. }
  304. if len(text) > end {
  305. unitstrs = append(unitstrs, text[pstart:index]+text[start:end])
  306. } else {
  307. unitstrs = append(unitstrs, text[pstart:index]+text[start:])
  308. }
  309. } else {
  310. if len(text) > end {
  311. unitstrs = append(unitstrs, text[start:end])
  312. } else {
  313. unitstrs = append(unitstrs, text[start:])
  314. }
  315. }
  316. } else {
  317. if len(text) > end {
  318. if index-3 >= 0 {
  319. unitstrs = append(unitstrs, text[index-3:index]+text[start:end])
  320. } else {
  321. unitstrs = append(unitstrs, text[start:end])
  322. }
  323. } else {
  324. if index-3 >= 0 {
  325. unitstrs = append(unitstrs, text[index-3:index]+text[start:])
  326. } else {
  327. unitstrs = append(unitstrs, text[start:])
  328. }
  329. }
  330. }
  331. pindex = start
  332. }
  333. }
  334. }
  335. //log.Println("unitstrs", fnums, unitstrs)
  336. unit := float64(0)
  337. fnum := float64(0)
  338. for k, v := range fnums {
  339. fnum = v
  340. units := regStrUnit.FindAllString(unitstrs[k], -1)
  341. for _, v := range units {
  342. if moneyUnit[v] != 0 {
  343. unit = moneyUnit[v]
  344. break
  345. }
  346. }
  347. if unit != float64(0) { //取第一个
  348. break
  349. }
  350. }
  351. fnum = fnum * repUnit
  352. if unit == float64(0) {
  353. moneyFloat = fnum
  354. } else {
  355. moneyFloat = fnum * unit
  356. }
  357. if unit == 10000 {
  358. flag = false
  359. } else {
  360. flag = true
  361. }
  362. return
  363. }
  364. // 清理所有空白符
  365. func CutAllSpace(text string) string {
  366. tmp := cutAllSpace.ReplaceAllString(text, "")
  367. tmp = replaceSymbol(tmp, spaces)
  368. return tmp
  369. }
  // replaceString replaces every occurrence of ret[i] in con with rep[i].
  //
  // The pairs are applied one after another in index order, so a later pair
  // sees the output of the earlier ones. Entries of ret that have no partner in
  // rep are ignored.
  func replaceString(con string, ret, rep []string) string {
  	pairs := len(ret)
  	if len(rep) < pairs {
  		pairs = len(rep)
  	}
  	for i := 0; i < pairs; i++ {
  		con = strings.ReplaceAll(con, ret[i], rep[i])
  	}
  	return con
  }
  // replaceSymbol deletes every occurrence of each string in rep from con.
  // The strings are removed one after another, in slice order.
  func replaceSymbol(con string, rep []string) string {
  	for i := 0; i < len(rep); i++ {
  		con = strings.ReplaceAll(con, rep[i], "")
  	}
  	return con
  }
  386. // MatchField 判断valus 是否包含key 字符串
  387. func MatchField(keys []string, values []string) (ok bool) {
  388. var matchKeyRegs []*RegexpInfo
  389. if len(keys) > 0 {
  390. for _, key := range keys {
  391. KeyReg := GetRegex(key)
  392. matchKeyRegs = append(matchKeyRegs, KeyReg...)
  393. }
  394. for _, reg := range matchKeyRegs {
  395. for _, val := range values {
  396. if reg.regs.MatchString(val) {
  397. return true
  398. }
  399. }
  400. }
  401. }
  402. return
  403. }
  // SumFields counts how many entries of keys are present in val.
  //
  // Presence is all that matters: a key stored with a nil value still counts,
  // and a key listed twice in keys is counted twice.
  func SumFields(keys []string, val map[string]interface{}) (res int) {
  	for i := 0; i < len(keys); i++ {
  		_, present := val[keys[i]]
  		if present {
  			res++
  		}
  	}
  	return res
  }
  // containsChinese reports whether str contains at least one character in the
  // range U+4E00 to U+9FA5.
  //
  // The string is scanned rune by rune rather than with regexp.MatchString,
  // which would compile the pattern again on every call.
  func containsChinese(str string) bool {
  	for _, r := range str {
  		if r >= 0x4e00 && r <= 0x9fa5 {
  			return true
  		}
  	}
  	return false
  }