Prerequisites: basic knowledge of HDFS (Hadoop Distributed File System) operations and the MapReduce computing framework (covered in "Big Data: Hadoop Pseudo-Cluster Setup and MapReduce Programming Primer")
wget http://mirrors.cnnic.cn/apache/pig/pig-0.15.0/pig-0.15.0.tar.gz
tar zxvf pig-0.15.0.tar.gz
cd pig-0.15.0/
vim bin/pig
Edit:
JAVA_HOME=/opt/jdk1.7.0_67
HADOOP_HOME=/opt/hadoop-2.7.1
HADOOP_CONF_DIR=/opt/hadoop-2.7.1/etc/hadoop
vim ~/.bashrc
Edit:
JAVA_HOME=/opt/jdk1.7.0_67
export PATH=$JAVA_HOME/bin:$PATH
HADOOP_HOME=/opt/hadoop-2.7.1
export PATH=$HADOOP_HOME/bin:$PATH
PIG_HOME=/opt/pig-0.15.0/
export PATH=$PIG_HOME/bin:$PATH
ANT_HOME=/opt/apache-ant-1.9.6
export PATH=$ANT_HOME/bin:$PATH
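After saving ~/.bashrc, reload it so the new variables take effect in the current shell (assuming bash), and sanity-check the installation:
source ~/.bashrc
pig -version    # should report Apache Pig version 0.15.0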
vim conf/pig.properties
Edit:
# Should scripts check to prevent multiple stores writing to the same location?
# (default: false) When set to true, stops the execution of script right away.
# prevents a STORE from silently overwriting results already written to the same location
pig.location.check.strict=true
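For example, with strict checking enabled, a hypothetical script like the one below is rejected before execution instead of having the second STORE overwrite the first:
a = LOAD 'input.txt' AS (line:chararray);
b = FILTER a BY line IS NOT NULL;
STORE a INTO 'out';
STORE b INTO 'out';  -- same output location: the script is stopped right away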
Interactive mode (Grunt shell):
fs -ls /    # fs: Hadoop file system operations
sh ls       # sh: run shell commands
pig --help
pig -e 'fs -ls /'    # execute a single command
pig -c test.pig      # syntax-check a Pig script
pig -f test.pig      # run a Pig script
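For example, a short word-count session in the local-mode Grunt shell (the input file /tmp/words.txt is illustrative):
pig -x local
grunt> lines = LOAD '/tmp/words.txt' AS (word:chararray);
grunt> grouped = GROUP lines BY word;
grunt> counts = FOREACH grouped GENERATE group, COUNT(lines);
grunt> DUMP counts;
grunt> quit;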
cd /home/work/lidanqing01/pig-0.15.0/tutorial
vim build.xml
Edit: add the pig and hadoop dependency jars to the classpath
<path id="tutorial.classpath">
    <fileset dir="../lib/">
        <include name="*.jar"/>
    </fileset>
    <fileset dir="../lib/hadoop1-runtime/">
        <include name="*.jar"/>
    </fileset>
    <fileset dir="..">
        <include name="pig*-core-*.jar"/>
    </fileset>
    <pathelement location="${build.classes}"/>
    <pathelement location="${pigjar}"/>
</path>
ant
tar zxvf pigtutorial.tar.gz
cd pigtmp    # this directory contains the example Pig scripts
pig -x local script1-local.pig
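The local run writes its results to the current directory; in the Pig 0.15 tutorial the first script stores them under script1-local-results.txt (confirm the STORE path in the script if yours differs):
ls script1-local-results.txt
cat script1-local-results.txt/part-* | head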
hadoop fs -mkdir -p /user/root
hadoop fs -put excite.log.bz2 .    # store the log in HDFS as /user/{user}/excite.log.bz2
pig -f script1-hadoop.pig
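To review the job output on HDFS (the tutorial script stores its results in script1-hadoop-results; confirm the STORE path in your copy if it differs):
hadoop fs -ls script1-hadoop-results
hadoop fs -cat 'script1-hadoop-results/*' | head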
3. pig UDF udf_pig.py

db_parse.pig (the full script: it streams the raw log through normalize_mapper.py, groups and counts the normalized SQL, then decodes it with the Jython UDF):
raw_log_entries = LOAD 'mysql.log' using TextLoader AS (line:chararray);
define sqlicheck `python normalize_mapper.py` ship('normalize_mapper.py');
stream_log_entries = stream raw_log_entries through sqlicheck as(sql_base64:chararray, time_base64:chararray);
group_log_entries = GROUP stream_log_entries by sql_base64;
log_count = foreach group_log_entries generate flatten(group) as sql_base64:chararray, COUNT(stream_log_entries) as count;
log_count_order = order log_count by count desc;
register 'udf_pig.py' using jython as udf_tool;
decode_base64_log_count = foreach log_count_order generate udf_tool.base64_decode(sql_base64), count;
store decode_base64_log_count into 'sql_count';
4. pig streaming normalize_mapper.py (source: http://danqingdani.blog.163.com/blog/static/186094195201611673420929/ )

udf_pig.py (the Jython UDF registered above):
import base64
@outputSchema("sql:chararray")
def base64_decode(s):
return base64.b64decode(s)
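The actual normalize_mapper.py lives at the blog link above. Purely as an illustrative sketch of the streaming contract it has to satisfy, a compatible mapper reads log lines from stdin and prints two tab-separated base64 fields (sql_base64, then time_base64) per line; the field parsing below is made up and is not the original script:
import base64
import sys

for line in sys.stdin:
    line = line.rstrip("\n")
    if not line:
        continue
    # Illustrative parsing only: take the first whitespace-separated token as the
    # time field and the rest as the SQL text; the real mapper normalizes MySQL
    # log entries (see the linked post for its actual logic).
    parts = line.split(None, 1)
    if len(parts) != 2:
        continue
    time_field, sql = parts
    sql_b64 = base64.b64encode(sql.encode("utf-8")).decode("ascii")
    time_b64 = base64.b64encode(time_field.encode("utf-8")).decode("ascii")
    # Output order must match the AS (...) clause: sql_base64, then time_base64.
    print("%s\t%s" % (sql_b64, time_b64))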
The result is as follows:
pig -x local -f db_parse.pig
more sql_count/part-r-00000
select @@version_comment limit 1 1501