动态正则匹配:1、写一个动态正则;2、只要写出日志的 Schema 就可以获取到日志的正则。
package com.donews.util

import java.util.regex.Pattern

import scala.collection.mutable.ArrayBuffer

/*
 * Created by yuhui on 2016/8/5.
 *
 * Log schemas this tool was written for, each with a sample line.
 *
 * Version 1
 *   schema: $domain $ip - $remote_user [$timestamp] "$http_url" $status $body_bytes_sent "$http_referer" "$http_user_agent" $country $region $city
 *   sample: www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" China 22 Beijing
 *
 * Version 2
 *   schema: $domain $ip - $remote_user [$timestamp] "$http_url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
 *   sample: www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
 *
 * Version 3
 *   schema: $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
 *   sample: www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
 */

/**
 * Builds a regular expression from a space-separated log schema and uses it to
 * split log lines into their fields.
 *
 * A schema token is turned into exactly one capturing group, so the n-th group of
 * a match corresponds to the n-th non-empty token of the schema.
 */
object DynamicRegex {

  /** Schema string; assigned by [[main]]. Public and mutable for existing callers. */
  var cmd: String = ""

  /** Regex source used by [[lineToGroup]]; assign the result of [[tran]] before parsing. */
  var regex: String = ""

  /** Regex fragment for one run of non-whitespace characters. */
  private val NonSpaceRun = "[\\S]+"

  /** Matches a token that consists of non-word characters only (for example "-"). */
  private val SeparatorOnly = Pattern.compile("\\W+")

  /** Matches one placeholder: a dollar sign followed by word characters. */
  private val Placeholder = Pattern.compile("\\$\\w+")

  /** Last compiled pattern with the source it was built from, so lines do not recompile it. */
  @volatile private var compiled: Option[(String, Pattern)] = None

  /**
   * Translates a log schema into an anchored regular expression.
   *
   * The schema is split on single spaces and the per-token fragments are joined
   * with `\s`. An empty token (caused by consecutive or leading spaces) contributes
   * an empty fragment, so the generated regex expects the same number of whitespace
   * characters as the schema has spaces. An empty schema yields `^$`.
   *
   * @param cmd the schema, e.g. the version 3 schema shown in the file header
   * @return the regex source, starting with `^` and ending with `$`
   */
  def tran(cmd: String): String =
    cmd.split(" ").map(tokenToRegex).mkString("^", "\\s", "$")

  /**
   * Produces the capturing group for one schema token.
   *
   *  - non-word characters only: `(\W+)`
   *  - no placeholder at all: the token itself, matched literally
   *  - starts with a placeholder: one `[\S]+` per placeholder, joined by the literal
   *    text found between the placeholders; trailing text is absorbed by `[\S]+`
   *  - literal text before the first placeholder: that text, then `.+` up to the
   *    literal text following the last placeholder (or `[\S]+` when nothing follows)
   */
  private def tokenToRegex(token: String): String =
    if (token.isEmpty) ""
    else if (SeparatorOnly.matcher(token).matches()) "(\\W+)"
    else {
      val spans = placeholderSpans(token)
      if (spans.isEmpty) "(" + escape(token) + ")"
      else {
        val prefix = token.substring(0, spans.head._1)
        val suffix = token.substring(spans.last._2)
        if (prefix.isEmpty) {
          // Literal text between each pair of neighbouring placeholders.
          val separators = spans.zip(spans.tail).map {
            case ((_, previousEnd), (nextStart, _)) => escape(token.substring(previousEnd, nextStart))
          }
          separators.foldLeft(NonSpaceRun)((acc, sep) => acc + sep + NonSpaceRun).mkString("(", "", ")")
        } else {
          // Delimited values such as [..] or ".." may contain spaces, hence ".+";
          // without a closing delimiter the value is limited to one non-space run.
          val value = if (suffix.isEmpty) NonSpaceRun else ".+"
          "(" + escape(prefix) + value + escape(suffix) + ")"
        }
      }
    }

  /** Returns the (start, end) offsets of every placeholder in `token`, in order. */
  private def placeholderSpans(token: String): Vector[(Int, Int)] = {
    val matcher = Placeholder.matcher(token)
    Iterator
      .continually(matcher)
      .takeWhile(_.find())
      .map(hit => (hit.start(), hit.end()))
      .toVector
  }

  /**
   * Escapes `original` so that it is matched literally inside a regex.
   *
   * Every character is prefixed with a backslash except double quotes, letters and
   * digits. Letters and digits must stay bare because a backslash in front of them
   * would form a regex construct (such as a digit class or a back-reference)
   * instead of a literal.
   *
   * @param original the literal text
   * @return the escaped text
   */
  def escape(original: String): String =
    original.map { c =>
      if (c == '"' || Character.isLetterOrDigit(c)) c.toString else "\\" + c
    }.mkString

  /** Returns the compiled form of [[regex]], recompiling only when its source changed. */
  private def currentPattern: Pattern = {
    val source = regex
    compiled match {
      case Some((cachedSource, pattern)) if cachedSource == source => pattern
      case _ =>
        val pattern = Pattern.compile(source)
        compiled = Some((source, pattern))
        pattern
    }
  }

  /**
   * Applies [[regex]] to a log line and collects the captured groups.
   *
   * Groups of every successive match are appended in order; a line that does not
   * match yields an empty buffer. A group that did not take part in a match is
   * appended as `null`, exactly as `Matcher.group` reports it.
   *
   * @param line the log line to split
   * @return the captured field values
   */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val groups = ArrayBuffer[String]()
    val matcher = currentPattern.matcher(line)
    while (matcher.find()) {
      (1 to matcher.groupCount()).foreach(i => groups += matcher.group(i))
    }
    groups
  }

  /** Demo: prints the regex generated for the version 3 schema and the fields of a sample line. */
  def main(args: Array[String]): Unit = {
    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"http://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(println)
  }
}
输出结果:
^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$ www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"