Corresponding regex:

^(\d{2}\d{2}\d{2}\s+\d{1,2}:\d{2}:\d{2}|\t)\s+\d+\s+([A-Za-z]+)\s*(.*)$

Example:

160114 11:15:02   903 Query BEGIN
                  903 Query REPLACE INTO heartbeat SET id='abc_0000', value=142341302

The problems during parsing: the second entry carries no timestamp of its own (its line starts with a tab), so the timestamp must be inherited from the previous entry; and a single statement may span multiple lines, which must be merged back together.
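To sanity-check the pattern against the sample above, here is a minimal sketch (it reuses the same db_pattern that normalize_mapper.py defines below; the sample lines are the ones from the example):

# -*- coding: utf-8 -*-
# minimal sanity check of db_pattern against the sample log above
import re

db_pattern = r"^(\d{2}\d{2}\d{2}\s+\d{1,2}:\d{2}:\d{2}|\t)\s+\d+\s+([A-Za-z]+)\s*(.*)$"
db_regex = re.compile(db_pattern)

sample = [
    "160114 11:15:02   903 Query BEGIN",
    "\t  903 Query REPLACE INTO heartbeat SET id='abc_0000', value=142341302",
]
for line in sample:
    m = db_regex.search(line)
    if m:
        time, command, sql = m.groups()
        # the second line matches with time == "\t": it has no timestamp of
        # its own, so the parser must inherit the previous entry's timestamp
        print repr(time), command, sql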
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(ParserRequestWritable.class);
job.setNumMapTasks(0);
job.setReducerClass(UniqueReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ParserRequestWritable.class);
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar \
-input myInputDirs \
-output myOutputDir \
-mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-reducer /bin/wc
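Streaming accepts any executable that reads stdin and writes stdout, so the /bin/wc reducer above could just as well be a few lines of Python; a minimal stand-in sketch:

#!/usr/bin/env python
# a Python stand-in for the /bin/wc reducer above: streaming only
# requires an executable that reads stdin and writes stdout
import sys

lines = words = chars = 0
for line in sys.stdin:
    lines += 1
    words += len(line.split())
    chars += len(line)
print lines, words, chars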
vim ~/.bashrc
HADOOP_HOME=/home/{user}/hadoop-2.6.0
alias HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar"
source ~/.bashrc
HSTREAMING -info
(1). Compiling Python

wget https://www.python.org/ftp/python/2.7.10/Python-2.7.10.tgz
tar zxvf Python-2.7.10.tgz
cd Python-2.7.10
# compile (note: in practice configure expects an absolute path for --prefix)
./configure --prefix=lib/Python27
make -j
make install

(2). Writing the mapper normalize_mapper.py
# -*- coding: utf-8 -*-
import sys
import json
import re
import logging
import base64

# db log format configuration
db_pattern = r"^(\d{2}\d{2}\d{2}\s+\d{1,2}:\d{2}:\d{2}|\t)\s+\d+\s+([A-Za-z]+)\s*(.*)$"
db_regex = re.compile(db_pattern)
sql_pattern = r"^(\S+)\s"
sql_regex = re.compile(sql_pattern)

# logging configuration: used to debug whether parsing is correct,
# e.g. whether multi-line SQL statements are concatenated correctly
logging.basicConfig(level=logging.ERROR,
                    format='%(message)s',
                    stream=sys.stderr)

# command filter: only Query entries are processed
# (note: despite the "blacklist" names, both lists act as allowlists)
command_blacklist = [
    "query"
]
query_blacklist = [
    "select",
    "update",
    "insert",
    "delete",
    "replace"
]


def read_mapper_output(file):
    """
    read data from file using yield
    """
    for line in file:
        yield line.rstrip()


def emit_statement(time, command, sql, line_number):
    """
    print a normalized statement as base64(sql)\tbase64(time)
    if it passes the command and statement-type filters
    """
    if sql and command.lower() in command_blacklist:
        sql_match = sql_regex.search(sql)
        if sql_match:
            sql_command = sql_match.group(1)
            if sql_command.lower() in query_blacklist:
                debug = "FINAL_RESULT %d: %s %s %s" % (line_number - 1, time, command, sql)
                logging.debug(debug)
                sql_base64 = base64.b64encode(sql)
                time_base64 = base64.b64encode(time)
                print "%s\t%s" % (sql_base64, time_base64)
    else:
        info = "NULL_COMMAND %d: %s %s %s" % (line_number - 1, time, command, sql)
        logging.info(info)


def db_log_normalize():
    """
    normalize db log: extend timestamps and merge multi-line sql statements
    """
    # read data from stdin
    data = read_mapper_output(sys.stdin)
    # timestamp of the previous entry, used when a line has none of its own
    last_time = "\t"
    # current time, command and sql
    time = ""
    command = ""
    sql = ""
    line_number = 1
    for line in data:
        db_match = db_regex.search(line)
        if db_match:
            # a new entry starts: flush the previous one
            if command != "":
                emit_statement(time, command, sql, line_number)
            time, command, sql = db_match.groups()
            # time extend: inherit the previous entry's timestamp
            if time == "\t":
                time = last_time
            else:
                last_time = time
        else:
            # continuation line of a multi-line statement; logged for debugging
            info = "MULTI_LINE %d: %s" % (line_number, line.strip())
            logging.info(info)
            if command != "":
                sql = sql + line
        line_number = line_number + 1
    # flush the last buffered entry (otherwise it would be dropped)
    if command != "":
        emit_statement(time, command, sql, line_number)


if __name__ == '__main__':
    db_log_normalize()
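Each mapper record is base64(sql), a tab, then base64(time); the base64 wrapping keeps tabs and newlines inside a merged SQL statement from breaking the key/value framing of the streaming record. A minimal decoding sketch (the sample record values are hypothetical):

# decode one mapper output line back into (sql, time)
import base64

line = "UkVQTEFDRSBJTlRPIGhlYXJ0YmVhdCAuLi4=\tMTYwMTE0IDExOjE1OjAy"  # hypothetical record
sql_base64, time_base64 = line.split("\t", 1)
print base64.b64decode(sql_base64)   # REPLACE INTO heartbeat ...
print base64.b64decode(time_base64)  # 160114 11:15:02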
(3). Writing the reducer normalize_reducer.py

#!/usr/bin/env python
import sys
import re
import base64
import logging
from itertools import groupby
from operator import itemgetter
import sqli_check
import udf_tool

# logging configuration
logging.basicConfig(level=logging.ERROR,
                    format='%(message)s',
                    stream=sys.stderr)


def read_mapper_output(file, separator='\t'):
    """
    read data from file and split each line into a k,v pair
    """
    for line in file:
        yield line.strip().split(separator, 1)


def db_log_sql_parse():
    data = read_mapper_output(sys.stdin, separator='\t')
    # k: sql_base64, v: time_base64
    for sql_base64, group in groupby(data, itemgetter(0)):
        num_of_request = 0
        time_list = set()
        for k, v in group:
            time = base64.b64decode(v)
            num_of_request = num_of_request + 1
            time_list.add(time)
        sql_parser_result = sqli_check.parser(sql_base64)
        # (the rest of the reducer, which consumes sql_parser_result,
        # num_of_request and time_list, is omitted in the original excerpt)
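Note that itertools.groupby only merges adjacent records with equal keys, so the reducer relies on its input arriving sorted by key; the framework's shuffle guarantees this, and the local test below emulates it with sort. A minimal illustration (hypothetical keys):

# groupby merges only *adjacent* equal keys, so reducer input must be
# sorted by key before grouping
from itertools import groupby
from operator import itemgetter

records = [("k1", "a"), ("k1", "b"), ("k2", "c")]  # already sorted by key
for key, group in groupby(records, itemgetter(0)):
    print key, [v for k, v in group]
# k1 ['a', 'b']
# k2 ['c']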
(4). Local testing

cat ../../data/mysql.log | ../../lib/Python27/bin/python normalize_mapper.py | sort -k1 | ../../lib/Python27/bin/python normalize_reducer.py 1>result 2>debug &

(5). Running on the cluster
(1). Packaging and uploading

tar zcvf sqlicheck.tar.gz lib/ src/
hadoop fs -put sqlicheck.tar.gz /

(2). Running the job

a. Version 2.x

HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar"
$HSTREAMING \
-D mapred.job.name='normalize db log' \
-archives "hdfs://xxx.xxx.xxx.xxx:xxx/sqlicheck.tar.gz#sqlicheck" \
-input $INPUT_PATH \
-output $OUTPUT_PATH \
-mapper "sqlicheck/lib/Python27/bin/python sqlicheck/src/main/normalize_mapper.py" \
-reducer "sqlicheck/lib/Python27/bin/python sqlicheck/src/main/normalize_reducer.py"

b. Version 1.x

Why sh here instead of py? There is an unfortunate reason for this, explained below.

$HADOOP_HOME/bin/hadoop --config $HADOOP_HOME/conf streaming \
-D mapred.job.name='normalize db log' \
-input $INPUT_PATH \
-output $OUTPUT_PATH \
-mapper "sh sqlicheck/src/main/normalize_mapper.sh" \
-reducer "sh sqlicheck/src/main/normalize_reducer.sh" \
-cacheArchive /sqlicheck/sqlicheck.tar.gz#sqlicheck
1. Fixing missing third-party library and Python version dependencies on the Hadoop cluster

/lib64/libc.so.6: version `GLIBC_2.14' not found

When the error above appears, the Python environment you uploaded is missing its dependency libraries. Check which shared libraries the interpreter needs:

ldd lib/Python27/bin/python
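The glibc version the bundled interpreter actually runs against can also be checked from inside Python itself; a small sketch (gnu_get_libc_version is glibc-specific, which Hadoop cluster nodes typically are):

# print the glibc version the running interpreter is using
import ctypes

libc = ctypes.CDLL("libc.so.6")
libc.gnu_get_libc_version.restype = ctypes.c_char_p
print libc.gnu_get_libc_version()  # e.g. 2.12 -- older than the 2.14 the error demands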
Copy the missing shared libraries into the packaged Python environment:

cp /lib/x86_64-linux-gnu/libpthread.so.0 lib/Python27/lib/
cp /lib/x86_64-linux-gnu/libdl.so.2 lib/Python27/lib/
cp /lib/x86_64-linux-gnu/libutil.so.1 lib/Python27/lib/
cp /lib/x86_64-linux-gnu/libm.so.6 lib/Python27/lib/
cp /lib/x86_64-linux-gnu/libc.so.6 lib/Python27/lib/

Before Python runs, point LD_LIBRARY_PATH at these bundled libraries; this is done in a shell wrapper:
vim normalize_mapper.sh
This is also why the jobs launch sh instead of py:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:sqlicheck/lib/Python27/lib
sqlicheck/lib/Python27/bin/python sqlicheck/src/main/normalize_mapper.py