<?xml version="1.0" encoding="utf-8" standalone="yes"?>依依成人综合视频,日本伊人精品一区二区三区观看方式,9久re热视频在线精品http://www.aygfsteel.com/paulwong/category/53479.htmlzh-cnTue, 23 Apr 2013 21:36:25 GMTTue, 23 Apr 2013 21:36:25 GMT60一个PIG脚本例子分析http://www.aygfsteel.com/paulwong/archive/2013/04/13/397791.htmlpaulwongpaulwongSat, 13 Apr 2013 07:21:00 GMThttp://www.aygfsteel.com/paulwong/archive/2013/04/13/397791.htmlhttp://www.aygfsteel.com/paulwong/comments/397791.htmlhttp://www.aygfsteel.com/paulwong/archive/2013/04/13/397791.html#Feedback0http://www.aygfsteel.com/paulwong/comments/commentRss/397791.htmlhttp://www.aygfsteel.com/paulwong/services/trackbacks/397791.html
# Launch one of the Pig log-analysis scripts, handing it the piggybank jar
# location plus the input and output paths as Pig parameters (-p NAME=VALUE).
# PIG_HOME must already be set in the environment.
PIGGYBANK_PATH="$PIG_HOME/contrib/piggybank/java/piggybank.jar"
INPUT=pig/input/test-pig-full.txt
# The timestamp suffix gives every run a fresh output directory.
OUTPUT="pig/output/test-pig-output-$(date +%Y%m%d%H%M%S)"
PIGSCRIPT=analyst_status_logs.pig

# Other scripts that can be assigned to PIGSCRIPT. Each name must stay on a
# comment line: a bare file name here would be executed as a command.
#analyst_500_404_month.pig
#analyst_500_404_day.pig
#analyst_404_percentage.pig
#analyst_500_percentage.pig
#analyst_unique_path.pig
#analyst_user_logs.pig
#analyst_status_logs.pig


# Quote every expansion so paths containing spaces reach pig as single arguments.
pig -p PIGGYBANK_PATH="$PIGGYBANK_PATH" -p INPUT="$INPUT" -p OUTPUT="$OUTPUT" "$PIGSCRIPT"


要分析的数据源,LOG 文件
46.20.45.18 - - [25/Dec/2012:23:00:25 +0100] "GET / HTTP/1.0" 302 - "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 46.20.45.18 "" 11011AEC9542DB0983093A100E8733F8 0
46.20.45.18 - - [25/Dec/2012:23:00:25 +0100] "GET /sign-in.jspx HTTP/1.0" 200 3926 "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 46.20.45.18 "" 11011AEC9542DB0983093A100E8733F8 0
69.59.28.19 - - [25/Dec/2012:23:01:25 +0100] "GET / HTTP/1.0" 302 - "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 69.59.28.19 "" 36D80DE7FE52A2D89A8F53A012307B0A 15


PIG脚本:
-- Computes, per month, the percentage of requests that returned HTTP status 500.
-- Parameters (passed with -p): PIGGYBANK_PATH, INPUT, OUTPUT.

-- Register the piggybank JAR: the script uses its DateExtractor UDF and MyRegExLoader.
register '$PIGGYBANK_PATH';

-- Declare short aliases for the date-extraction UDF.
DEFINE DATE_EXTRACT_MM
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM');

DEFINE DATE_EXTRACT_DD
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');

-- pig/input/test-pig-full.txt
-- Load the file named by $INPUT and name the columns; each regex capture group
-- becomes one field, so every row is a tuple of the 17 fields listed below.
raw_logs = load '$INPUT' USING org.apache.pig.piggybank.storage.MyRegExLoader('^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] "(\\S+) (\\S+) (HTTP[^"]+)" (\\S+) (\\S+) "([^"]*)" "([^"]*)" "(\\S+)" "(\\S+)" (\\S+) "(.*)" (\\S+) (\\S+)')
as (remoteAddr: chararray, 
n2: chararray, 
n3: chararray, 
time: chararray, 
method: chararray,
path:chararray,
protocol:chararray,
status: int, 
bytes_string: chararray, 
referrer: chararray, 
browser: chararray, 
n10:chararray,
remoteLogname: chararray, 
remoteAddr12: chararray, 
path2: chararray, 
sessionid: chararray, 
n15: chararray
);

-- Filter the data: drop requests whose user agent matches the pingdom pattern.
filter_logs = FILTER raw_logs BY not (browser matches '.*pingdom.*');
--item_logs = FOREACH raw_logs GENERATE browser;

--percent 500 logs
-- Re-project the rows, keeping only two fields: status and month.
reitem_percent_500_logs = FOREACH filter_logs GENERATE status,DATE_EXTRACT_MM(time) as month;
-- Group by month; each output row is (group, bag of the rows for that month).
group_month_percent_500_logs = GROUP reitem_percent_500_logs BY (month);
-- Compute the per-group statistic from the grouped relation.
final_month_500_logs = FOREACH group_month_percent_500_logs 
{
    -- Inside this nested block, reitem_percent_500_logs refers to the bag of rows
    -- belonging to the current group (month == group), not to the whole relation,
    -- so this counts the requests of one month.
    total = COUNT(reitem_percent_500_logs);
    -- Filter the same per-group bag down to the rows with status == 500.
    t = filter reitem_percent_500_logs by status== 500; --create a bag which contains only T values
    -- Emit the group key and the percentage of status-500 requests.
    generate flatten(group) as col1, 100*(double)COUNT(t)/(double)total;
}
STORE final_month_500_logs into '$OUTPUT' using PigStorage(',');



paulwong 2013-04-13 15:21 发表评论
]]>
鎶婂懡浠よ涓殑鍊間紶榪汸IG涓?/title><link>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397645.html</link><dc:creator>paulwong</dc:creator><author>paulwong</author><pubDate>Wed, 10 Apr 2013 07:32:00 GMT</pubDate><guid>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397645.html</guid><wfw:comment>http://www.aygfsteel.com/paulwong/comments/397645.html</wfw:comment><comments>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397645.html#Feedback</comments><slash:comments>0</slash:comments><wfw:commentRss>http://www.aygfsteel.com/paulwong/comments/commentRss/397645.html</wfw:commentRss><trackback:ping>http://www.aygfsteel.com/paulwong/services/trackbacks/397645.html</trackback:ping><description><![CDATA[<a target="_blank">http://wiki.apache.org/pig/ParameterSubstitution<br /> <br /> <br /> </a> <div> <div style="background-color:#eeeeee;font-size:13px;border:1px solid #CCCCCC;padding-right: 5px;padding-bottom: 4px;padding-left: 4px;padding-top: 4px;width: 98%;word-break:break-all"><!--<br /> <br /> Code highlighting produced by Actipro CodeHighlighter (freeware)<br /> http://www.CodeHighlighter.com/<br /> <br /> -->%pig -param input=/user/paul/sample.txt -param output=/user/paul/output/</div> </div><br /><br />PIG涓幏鍙?br /><div style="background-color:#eeeeee;font-size:13px;border:1px solid #CCCCCC;padding-right: 5px;padding-bottom: 4px;padding-left: 4px;padding-top: 4px;width: 98%;word-break:break-all"><!--<br /><br />Code highlighting produced by Actipro CodeHighlighter (freeware)<br />http://www.CodeHighlighter.com/<br /><br />-->records = LOAD <span style="color: #800080; ">$input</span>;</div><img src ="http://www.aygfsteel.com/paulwong/aggbug/397645.html" width = "1" height = "1" /><br><br><div align=right><a style="text-decoration:none;" href="http://www.aygfsteel.com/paulwong/" target="_blank">paulwong</a> 2013-04-10 15:32 <a href="http://www.aygfsteel.com/paulwong/archive/2013/04/10/397645.html#Feedback" target="_blank" 
style="text-decoration:none;">鍙戣〃璇勮</a></div>]]></description></item><item><title>PIG涓殑鍒嗙粍緇熻鐧懼垎姣?/title><link>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397642.html</link><dc:creator>paulwong</dc:creator><author>paulwong</author><pubDate>Wed, 10 Apr 2013 06:13:00 GMT</pubDate><guid>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397642.html</guid><wfw:comment>http://www.aygfsteel.com/paulwong/comments/397642.html</wfw:comment><comments>http://www.aygfsteel.com/paulwong/archive/2013/04/10/397642.html#Feedback</comments><slash:comments>0</slash:comments><wfw:commentRss>http://www.aygfsteel.com/paulwong/comments/commentRss/397642.html</wfw:commentRss><trackback:ping>http://www.aygfsteel.com/paulwong/services/trackbacks/397642.html</trackback:ping><description><![CDATA[<a target="_blank">http://stackoverflow.com/questions/15318785/pig-calculating-percentage-of-total-for-a-field<br /><br /></a><a target="_blank">http://stackoverflow.com/questions/13476642/calculating-percentage-in-a-pig-query</a><img src ="http://www.aygfsteel.com/paulwong/aggbug/397642.html" width = "1" height = "1" /><br><br><div align=right><a style="text-decoration:none;" href="http://www.aygfsteel.com/paulwong/" target="_blank">paulwong</a> 2013-04-10 14:13 <a href="http://www.aygfsteel.com/paulwong/archive/2013/04/10/397642.html#Feedback" target="_blank" style="text-decoration:none;">鍙戣〃璇勮</a></div>]]></description></item><item><title>CombinedLogLoaderhttp://www.aygfsteel.com/paulwong/archive/2013/04/08/397510.htmlpaulwongpaulwongMon, 08 Apr 2013 03:28:00 GMThttp://www.aygfsteel.com/paulwong/archive/2013/04/08/397510.htmlhttp://www.aygfsteel.com/paulwong/comments/397510.htmlhttp://www.aygfsteel.com/paulwong/archive/2013/04/08/397510.html#Feedback0http://www.aygfsteel.com/paulwong/comments/commentRss/397510.htmlhttp://www.aygfsteel.com/paulwong/services/trackbacks/397510.html
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 * 
 * 
http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 
*/

package org.apache.pig.piggybank.storage.apachelog;

import java.util.regex.Pattern;

import org.apache.pig.piggybank.storage.RegExLoader;

/**
 * CombinedLogLoader is used to load logs based on Apache's combined log format, based on a format like
 * 
 * LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
 * 
 * The log filename ends up being access_log from a line like
 * 
 * CustomLog logs/combined_log combined
 * 
 * Example:
 * 
 * raw = LOAD 'combined_log' USING org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader AS
 * (remoteAddr, remoteLogname, user, time, method, uri, proto, status, bytes, referer, userAgent);
 * 
 
*/

public class CombinedLogLoader extends RegExLoader {
    // 1.2.3.4 - - [30/Sep/2008:15:07:53 -0400] "GET / HTTP/1.1" 200 3190 "-"
    
// "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1"
    private final static Pattern combinedLogPattern = Pattern
        .compile("^(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+.(\\S+\\s+\\S+).\\s+\"(\\S+)\\s+(.+?)\\s+(HTTP[^\"]+)\"\\s+(\\S+)\\s+(\\S+)\\s+\"([^\"]*)\"\\s+\"(.*)\"$");

    public Pattern getPattern() {
        return combinedLogPattern;
    }
}


paulwong 2013-04-08 11:28 发表评论
]]>
Analyzing Apache logs with Pig http://www.aygfsteel.com/paulwong/archive/2013/04/08/397489.htmlpaulwongpaulwongSun, 07 Apr 2013 18:06:00 GMThttp://www.aygfsteel.com/paulwong/archive/2013/04/08/397489.htmlhttp://www.aygfsteel.com/paulwong/comments/397489.htmlhttp://www.aygfsteel.com/paulwong/archive/2013/04/08/397489.html#Feedback0http://www.aygfsteel.com/paulwong/comments/commentRss/397489.htmlhttp://www.aygfsteel.com/paulwong/services/trackbacks/397489.html

Analyzing log files, churning them and extracting meaningful information is a potential use case in Hadoop. We don’t have to go in for MapReduce programming for these analyses; instead we can go for tools like Pig and Hive for this log analysis. I’d just give you a start off on the analysis part. Let us consider Pig for apache log analysis. Pig has some built in libraries that would help us load the apache log files into pig and also some cleanup operation on string values from crude log files. All the functionalities are available in the piggybank.jar mostly available under pig/contrib/piggybank/java/ directory. As the first step we need to register this jar file with our pig session then only we can use the functionalities in our Pig Latin
1.       Register PiggyBank jar
REGISTER /usr/lib/pig/contrib/piggybank/java/piggybank.jar;
Once we have registered the jar file, we need to define a few functions to be used in our Pig Latin. For any basic Apache log analysis we need a loader that loads the log files into Pig in a column-oriented format; we can create an Apache log loader as follows
2.       Define a log loader
DEFINE ApacheCommonLogLoader org.apache.pig.piggybank.storage.apachelog.CommonLogLoader();
(Piggy Bank has other log loaders as well)
In apache log files the default format of date is ‘dd/MMM/yyyy:HH:mm:ss Z’ . But such a date won’t help us much in case of log analysis we may have to extract date without time stamp. For that we use DateExtractor()
3.       Define Date Extractor
DEFINE DayExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');
Once we have the required functionalities with us we need to first load the log file into pig
4.       Load apachelog file into pig
--load the log files from hdfs into pig using CommonLogLoader
logs = LOAD '/userdata/bejoys/pig/p01/access.log.2011-01-01' USING ApacheCommonLogLoader AS (ip_address, rfc, userId, dt, request, serverstatus, returnobject, referersite, clientbrowser);
Now we are ready to dive in for the actual log analysis. There would be multiple information you need to extract out of a log; we’d see a few of those common requirements out here
Note: you need to first register the jar, define the classes to be used and load the log files into pig before trying out any of the pig latin below
Requirement 1: Find unique hits per day
PIG Latin
--Extracting the day alone and grouping records based on days
--(GROUP ... BY takes a bare expression with no AS clause; the key is named in the GENERATE below)
grpd = GROUP logs BY DayExtractor(dt);
--looping through each group to get the unique no of userIds
cntd = FOREACH grpd
{
                --inside the nested block, logs is the bag of records for the current day
                tempId =  logs.userId;
                uniqueUserId = DISTINCT tempId;
                GENERATE group AS day,COUNT(uniqueUserId) AS cnt;
}
--sorting the processed records based on no of unique user ids in descending order
srtd = ORDER cntd BY cnt desc;
--storing the final result into a hdfs directory
STORE srtd INTO '/userdata/bejoys/pig/ApacheLogResult1';
Requirement 2: Find unique hits to websites (IPs) per day
PIG Latin
--Extracting the day alone and grouping records based on days and ip address
--(the GROUP key takes bare expressions with no AS clause)
grpd = GROUP logs BY (DayExtractor(dt),ip_address);
--looping through each group to get the unique no of userIds
cntd = FOREACH grpd
{
                --inside the nested block, logs is the bag of records for the current (day, ip_address) pair
                tempId =  logs.userId;
                uniqueUserId = DISTINCT tempId;
                --note: group is the (day, ip_address) tuple here, not the day alone
                GENERATE group AS day,COUNT(uniqueUserId) AS cnt;
}
--sorting the processed records based on no of unique user ids in descending order
srtd = ORDER cntd BY cnt desc;
--storing the final result into a hdfs directory (no stray spaces inside the path literal)
STORE srtd INTO '/userdata/bejoys/pig/ApacheLogResult2';
Note: When you use pig latin in grunt shell we need to know a few factors
1.       When we issue a pig statement in grunt and press enter only the semantic check is being done, no execution is triggered.
2.       All the pig statements are executed only after the STORE command is submitted, ie map reduce programs would be triggered only after STORE is submitted
3.       Also in this case you don’t have to load the log files again and again to pig once it is loaded we can use the same for all related operations in that session. Once you are out of the grunt shell the loaded files are lost, you’d have to perform the register and log file loading steps all over again.


paulwong 2013-04-08 02:06 发表评论
]]>
PIG灝忚http://www.aygfsteel.com/paulwong/archive/2013/04/05/397411.htmlpaulwongpaulwongFri, 05 Apr 2013 13:33:00 GMThttp://www.aygfsteel.com/paulwong/archive/2013/04/05/397411.htmlhttp://www.aygfsteel.com/paulwong/comments/397411.htmlhttp://www.aygfsteel.com/paulwong/archive/2013/04/05/397411.html#Feedback0http://www.aygfsteel.com/paulwong/comments/commentRss/397411.htmlhttp://www.aygfsteel.com/paulwong/services/trackbacks/397411.html浠涔堟槸PIG
鏄竴縐嶈璁¤璦錛岄氳繃璁捐鏁版嵁鎬庝箞嫻佸姩錛岀劧鍚庣敱鐩稿簲鐨勫紩鎿庡皢姝ゅ彉鎴怣APREDUCE JOB鍘籋ADOOP涓繍琛屻?/div>
PIG涓嶴QL
涓よ呮湁鐩稿悓涔嬪錛屾墽琛屼竴涓垨澶氫釜璇彞錛岀劧鍚庡嚭鏉ヤ竴浜涚粨鏋溿?/div>
浣嗕笉鍚岀殑鏄紝SQL瑕佸厛鎶婃暟鎹鍒拌〃涓墠鑳芥墽琛岋紝SQL涓嶅叧蹇冧腑闂村浣曞仛錛屽嵆鍙戜竴涓猄QL璇彞榪囧幓錛屽氨鏈夌粨鏋滃嚭鏉ャ?/div>
PIG錛屾棤欏誨鏁版嵁鍒拌〃涓紝浣嗚璁捐鐩村埌鍑虹粨鏋滅殑涓棿榪囩▼錛屾楠ゅ浣曠瓑絳夈?/div>

paulwong 2013-04-05 21:33 发表评论
]]>PIG璧勬簮http://www.aygfsteel.com/paulwong/archive/2013/04/05/397406.htmlpaulwongpaulwongFri, 05 Apr 2013 10:19:00 GMThttp://www.aygfsteel.com/paulwong/archive/2013/04/05/397406.htmlhttp://www.aygfsteel.com/paulwong/comments/397406.htmlhttp://www.aygfsteel.com/paulwong/archive/2013/04/05/397406.html#Feedback0http://www.aygfsteel.com/paulwong/comments/commentRss/397406.htmlhttp://www.aygfsteel.com/paulwong/services/trackbacks/397406.html http://guoyunsky.iteye.com/blog/1317084

http://guoyunsky.iteye.com/category/196632

Hadoop学习笔记(9) Pig简介
http://www.distream.org/?p=385


[hadoop系列]Pig的安装和简单示例
http://blog.csdn.net/inkfish/article/details/5205999


Hadoop and Pig for Large-Scale Web Log Analysis
http://www.devx.com/Java/Article/48063


Pig实战
http://www.cnblogs.com/xuqiang/archive/2011/06/06/2073601.html


[鍘熷垱]Apache Pig涓枃鏁欑▼錛堣繘闃訛級
http://www.codelast.com/?p=4249


基于hadoop平台的pig语言对apache日志系统的分析
http://goodluck-wgw.iteye.com/blog/1107503


!!Pig璇█
http://hi.baidu.com/cpuramdisk/item/a2980b78caacfa3d71442318


Embedding Pig In Java Programs
http://wiki.apache.org/pig/EmbeddedPig


一个pig事例(REGEX_EXTRACT_ALL, DBStorage,结果存进数据库)
http://www.myexception.cn/database/1256233.html


Programming Pig
http://ofps.oreilly.com/titles/9781449302641/index.html


[原创]Apache Pig的一些基础概念及用法总结(1)
http://www.codelast.com/?p=3621


!PIG手册
http://pig.apache.org/docs/r0.11.1/func.html#built-in-functions

paulwong 2013-04-05 18:19 发表评论
]]>
主站蜘蛛池模板: 郁南县| 泰宁县| 旌德县| 玉山县| 横峰县| 扬州市| 荣成市| 卓资县| 三台县| 苏尼特左旗| 简阳市| 水富县| 仙居县| 镇巴县| 广饶县| 洛南县| 大埔县| 定州市| 寿光市| 建宁县| 丹东市| 江都市| 乌兰县| 宁晋县| 同德县| 柞水县| 阳西县| 寿宁县| 舒兰市| 绿春县| 兰州市| 吉水县| 桐柏县| 宿迁市| 马尔康县| 恩施市| 乌什县| 民县| 安塞县| 义乌市| 简阳市|