沉睡森林@漂在北京

          本處文章除注明“轉載”外均為原創,轉載請注明出處。

            BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
            152 隨筆 :: 4 文章 :: 114 評論 :: 0 Trackbacks
          例子很簡單,我沒有運行自帶的wordcount,而是自己做了一個簡單的例子。
          實現的功能是從我們的nginx的access log里面計算url訪問的次數。
          access log文件:


          10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/css/base/base_jiexi-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/js/lib/lib-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /image/jiexi/logo.png HTTP/1.1" 304 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/lib/lib-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/base/base_jiexi-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/base/base_jiexi-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/lib/lib-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 200 56215 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 200 21254 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/base/base_jiexi-all-min.css HTTP/1.1" 200 22782 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
          10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/lib/lib-min.js HTTP/1.1" 200 137514 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"

          新建maven項目:
          <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xsi:schemaLocation
          ="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
              <modelVersion>4.0.0</modelVersion>
              <groupId>com.jiexi</groupId>
              <artifactId>jiexi-examples</artifactId>
              <version>0.0.1-SNAPSHOT</version>
              <dependencies>
                  <!-- Only dependency of the example: the mapper, reducer and job
                       classes import org.apache.hadoop.io / org.apache.hadoop.mapred
                       / org.apache.hadoop.fs types. -->
                  <dependency>
                      <groupId>org.apache.hadoop</groupId>
                      <artifactId>hadoop-core</artifactId>
                      <version>1.0.2</version>
                  </dependency>
              </dependencies>
          </project>


          Mapper代碼如下:
          package com.jiexi.examples.hadoop;

          import java.io.IOException;

          import org.apache.hadoop.io.IntWritable;
          import org.apache.hadoop.io.LongWritable;
          import org.apache.hadoop.io.Text;
          import org.apache.hadoop.mapred.MapReduceBase;
          import org.apache.hadoop.mapred.Mapper;
          import org.apache.hadoop.mapred.OutputCollector;
          import org.apache.hadoop.mapred.Reporter;

          public class AccessLogMapper extends MapReduceBase implements
                  Mapper<LongWritable, Text, Text, IntWritable> {
              private final static IntWritable one = new IntWritable(1);
              private Text url = new Text();

              static String POST = "\"POST ";
              static String GET = "\"GET ";
              static String END = " HTTP/1.0";

              public void map(LongWritable key, Text value,
                      OutputCollector<Text, IntWritable> output, Reporter reporter)
                      throws IOException {
                  String line = value.toString();
                  String url1 = getUrl(line);

                  url.set(url1);
                  output.collect(url, one);
              }

              public static void main(String[] args) {
                  String a = "10.2.112.34 - - [06/Mar/2012:18:05:41 +0800] \"GET /mine?originUrl= HTTP/1.0\" 302 -";
                  String b = "10.2.112.34 - - [06/Mar/2012:15:02:42 +0800] \"POST /user/login?originUrl=http%3A%2F%2Fwww.jiexi.com%2Fhome HTTP/1.0\" 200 25";
           
          //        System.out.println(getUrl(a));
          //        System.out.println(getUrl(b));
                  
                  String s =" /user/register?originUrl=http%3A%2F%2Fwww.jiexi.com%2Fhome";
                  
                  System.out.println(s.substring(0,s.indexOf("?")));
              }

              private static String getUrl(String a) {
                  // int len = POST.length();
                  int begin = a.indexOf(POST);
                  int get = a.indexOf(GET);
                  if (get > -1) {
                      begin = get;
                      // len = GET.length();
                  }

                  int end = a.indexOf(END);

                  String url = a.substring(begin + 1, end);

                  if (url.indexOf("?") > 0) {
                      return url.substring(0, url.indexOf("?"));
                  }

                  return url;
              }

          }

          Reducer代碼如下:
          package com.jiexi.examples.hadoop;

          import java.io.IOException;
          import java.util.Iterator;

          import org.apache.hadoop.io.IntWritable;
          import org.apache.hadoop.io.Text;
          import org.apache.hadoop.mapred.MapReduceBase;
          import org.apache.hadoop.mapred.OutputCollector;
          import org.apache.hadoop.mapred.Reducer;
          import org.apache.hadoop.mapred.Reporter;

          public class AccessLogReducer extends MapReduceBase implements
                  Reducer<Text, IntWritable, Text, IntWritable> {

              public void reduce(Text key, Iterator<IntWritable> values,
                      OutputCollector<Text, IntWritable> output, Reporter reporter)
                      throws IOException {
                  int sum = 0;
                  while (values.hasNext()) {
                      sum += values.next().get();
                  }
                  output.collect(key, new IntWritable(sum));
              }

          }

          job調用
          package com.jiexi.examples.hadoop;

          import org.apache.hadoop.fs.Path;
          import org.apache.hadoop.io.IntWritable;
          import org.apache.hadoop.io.Text;
          import org.apache.hadoop.mapred.FileInputFormat;
          import org.apache.hadoop.mapred.FileOutputFormat;
          import org.apache.hadoop.mapred.JobClient;
          import org.apache.hadoop.mapred.JobConf;
          import org.apache.hadoop.mapred.TextInputFormat;
          import org.apache.hadoop.mapred.TextOutputFormat;

          public class AccessLogPerDayJob {
              public static void main(String[] args) throws Throwable {
                  JobConf jobConf = new JobConf(AccessLogPerDayJob.class);
                  jobConf.setJobName("access_log");

                  jobConf.setOutputKeyClass(Text.class);
                  jobConf.setOutputValueClass(IntWritable.class);

                  jobConf.setMapperClass(AccessLogMapper.class);
                  jobConf.setCombinerClass(AccessLogReducer.class);
                  jobConf.setReducerClass(AccessLogReducer.class);

                  jobConf.setInputFormat(TextInputFormat.class);
                  jobConf.setOutputFormat(TextOutputFormat.class);

                  FileInputFormat.addInputPath(jobConf, new Path(args[0]));
                  FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

                  JobClient.runJob(jobConf);
              }
          }
          ===============================================
          mvn clean package 
          把target目錄下生成的jar(jiexi-examples-0.0.1-SNAPSHOT.jar)改名為access_log.jar,丟到namenode的/opt/hadoop 下面

          運行:
          ./bin/hadoop dfs -mkdir access_log_in
          ./bin/hadoop dfs -ls

          #拷貝本地日志文件到hdfs中
          ./bin/hadoop dfs -put /opt/access_log/*  access_log_in

          #運行,把access_log.jar拷貝到/opt/hadoop下面
          ./bin/hadoop jar access_log.jar com.jiexi.examples.hadoop.AccessLogPerDayJob    access_log_in  access_log_out


          查看job運行情況:
          http://10.2.112.31:50030/jobtracker.jsp 

          查看data(先列出輸出目錄,再查看結果文件內容):
          ./bin/hadoop dfs -ls access_log_out
          ./bin/hadoop dfs -cat access_log_out/part-00000




          posted on 2012-04-12 12:50 王總兵 閱讀(6843) 評論(0)  編輯  收藏

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 射阳县| 青州市| 潮州市| 石台县| 白水县| 上虞市| 博爱县| 尉氏县| 通山县| 吕梁市| 巴楚县| 重庆市| 平原县| 津南区| 华坪县| 孝昌县| 屯门区| 安徽省| 玉树县| 垦利县| 华坪县| 永济市| 射阳县| 鲁山县| 宣威市| 遵化市| 高平市| 肥西县| 南昌市| 宝清县| 宿州市| 安溪县| 庆安县| 潜山县| 原平市| 滦平县| 和田县| 沅陵县| 林口县| 彭水| 石门县|