频道栏目
首页 > 网络 > 云计算 > 正文

动态正则匹配

2017-01-06 09:24:59         来源:北京小辉的博客  
收藏   我要投稿

动态正则匹配:1、写一个动态正则;2、只要写出日志的Schema就可以获取到日志的正则。

package com.donews.util

import java.util.regex.Pattern

import scala.collection.mutable.ArrayBuffer

/**
  * Created by yuhui on 2016/8/5.
  */

/***
例子 :      www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +https://www.baidu.com/search/spider.html)" China 22 Beijing
第一版本    "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" $country $region $city"

例子 :      www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +https://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第二版本    "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""

例子 :     www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "https://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +https://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第三版本    $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
 */


/**
  * Builds a line-matching regular expression from a space-separated log
  * schema such as `$domain $ip - [$timestamp] "$http_url"`, and splits log
  * lines into their captured fields with it.
  *
  * Every schema token becomes one capture group followed by `\s`; the final
  * `\s` is replaced by `$` and the whole expression is anchored with `^`.
  */
object DynamicRegex {

  /** Schema string assigned by `main`; the other methods do not read it. */
  var cmd = ""

  /** Regex source compiled by `lineToGroup`; set it (e.g. from `tran`) before calling. */
  var regex = ""

  /** Matches a token made only of non-word characters (e.g. `-`). */
  private val SymbolOnlyToken = Pattern.compile("^(\\W+)$")

  /** Finds the separator between the first two variables of a `$a:$b` style token. */
  private val FirstSeparator = Pattern.compile("(\\$\\w+)(\\W+)(\\$\\w+)(.*)")

  /** Matches a variable wrapped in punctuation; the trailing punctuation is optional. */
  private val WrappedVariable = Pattern.compile("(\\W+)(\\$\\w+)(\\W*)")

  /** Characters that carry a special meaning inside a regex character class. */
  private val CharClassSpecials = "\\[]^&-"

  /** Capture group for one whitespace-free field, plus the field separator. */
  private val PlainField = "([\\S]+)\\s"

  /**
    * Translates a log schema into the source of an anchored regular expression.
    *
    * Token rules:
    *  - punctuation only (`-`)            -> `(\W+)`
    *  - single variable (`$ip`)           -> `([\S]+)`
    *  - several variables (`$a:$b`)       -> `([\S]+[:][\S]+)`, using the first
    *    separator found for every gap
    *  - wrapped variable (`[$t]`, `"$u"`) -> `(\[.+\])`, `(".+")`
    *  - literal word without `$`          -> `([\S]+)`
    *  - empty token (doubled space)       -> an extra `\s`
    *
    * @param cmd schema whose tokens are separated by single spaces
    * @return regex source starting with `^` and ending with `$`
    */
  def tran(cmd: String): String = {
    val body = cmd.split(" ").map(tokenToRegex).mkString("^", "", "")
    // Every fragment ends with "\s"; the last one is swapped for the end anchor.
    // A blank schema yields no fragments at all, so there is nothing to strip.
    val trimmed = if (body.endsWith("\\s")) body.dropRight(2) else body
    trimmed + "$"
  }

  /** Converts one schema token into its regex fragment (always terminated by `\s`). */
  private def tokenToRegex(token: String): String = {
    val dollarAt = token.indexOf("$")
    if (token.isEmpty) "\\s"
    else if (SymbolOnlyToken.matcher(token).find()) "(\\W+)\\s"
    else if (dollarAt < 0) PlainField // literal word: no variable to expand
    else if (dollarAt > 0) wrappedVariableGroup(token)
    else if (token.split("\\$").length > 2) multiVariableGroup(token)
    else PlainField
  }

  /** Builds the group for a token that starts with `$` and holds several variables. */
  private def multiVariableGroup(token: String): String = {
    val variableCount = token.split("\\$").length - 1
    val m = FirstSeparator.matcher(token)
    val separator = if (m.find()) m.group(2) else ""
    // An empty separator must not become "[]", which is not a valid character class.
    val separatorClass =
      if (separator.isEmpty) "" else "[" + escapeForCharClass(separator) + "]"
    "(" + List.fill(variableCount)("[\\S]+").mkString(separatorClass) + ")\\s"
  }

  /** Builds the group for a variable preceded (and optionally followed) by punctuation. */
  private def wrappedVariableGroup(token: String): String = {
    val m = WrappedVariable.matcher(token)
    if (m.find()) "(" + escape(m.group(1)) + ".+" + escape(m.group(3)) + ")\\s"
    else PlainField // e.g. a word prefix such as `id$x`: keep the field instead of dropping it
  }

  /** Backslash-escapes the characters that would change the meaning of a character class. */
  private def escapeForCharClass(separator: String): String =
    separator.map(c => if (CharClassSpecials.indexOf(c) >= 0) "\\" + c else c.toString).mkString

  /**
    * Prefixes every character except the double quote with a backslash.
    *
    * `tran` only passes it runs of non-word characters; a backslash in front of
    * a letter or digit would form a regex escape sequence instead of a literal.
    *
    * @param original text to escape
    * @return the escaped text
    */
  def escape(original: String): String =
    original.map(c => if (c == '"') "\"" else "\\" + c).mkString

  /**
    * Applies the object-level `regex` to a line and collects every capture group
    * of every match, in order.
    *
    * @param line log line to split
    * @return the captured groups; empty when the line does not match
    */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val groups = ArrayBuffer[String]()
    val m = Pattern.compile(regex).matcher(line)
    while (m.find()) {
      (1 to m.groupCount()).foreach(i => groups += m.group(i))
    }
    groups
  }

  /** Demo: prints the regex generated for a sample schema, then the fields of a sample line. */
  def main(args: Array[String]): Unit = {

    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"https://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +https://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(x => println(x))

  }
}

输出结果:

^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$
www.donews.com
123.125.71.72
-
-
[28/Nov/2016:11:08:50 +0800]
"GET /media/201408/2834414.shtm HTTP/1.1"
"https://www.donews.com/media/201408/2834414.shtm"
200
11296
"-"
"Mozilla/5.0 (compatible; Baiduspider/2.0; +https://www.baidu.com/search/spider.html)"
"-"
"China"
"22"
"Beijing"
上一篇:Zookeeper的安装部署
下一篇:Go语言中的队列和堆栈实例代码
相关文章
图文推荐

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站