
Hadoop word count: get the total number of words that start with the letter "c"

Here's the Hadoop word count Java map and reduce source code:

In the map function, I've gotten to where I can output every word that starts with the letter "c", along with the total number of times that word appears. What I'm trying to do instead is output just the total number of words starting with the letter "c", but I'm a little stuck on getting that total. Any help would be greatly appreciated, thank you.

Example

The output I'm getting now:

could 2

can 3

cat 5

What I'm trying to get:

c-total 10

public static class MapClass extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();

  public void map(LongWritable key, Text value,
                  OutputCollector<Text, IntWritable> output,
                  Reporter reporter) throws IOException {
    String line = value.toString();
    StringTokenizer itr = new StringTokenizer(line);
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      if (word.toString().startsWith("c")) {
        output.collect(word, one);
      }
    }
  }
}


public static class Reduce extends MapReduceBase
    implements Reducer<Text, IntWritable, Text, IntWritable> {

  public void reduce(Text key, Iterator<IntWritable> values,
                     OutputCollector<Text, IntWritable> output,
                     Reporter reporter) throws IOException {
    int sum = 0;
    while (values.hasNext()) {
      sum += values.next().get(); // add up the 1s emitted for this word
    }
    output.collect(key, new IntWritable(sum)); // emit the word and its total count
  }
}

Instead of

output.collect(word, one);

in your mapper, try:

output.collect("c-total", one);

Chris Gerken's answer is right.

If you output the word itself as your key, it will only let you count each distinct word starting with "c", not the total number of all words starting with "c".

So for that you need to emit a single fixed key from the mapper:

while (itr.hasMoreTokens()) {
    String token = itr.nextToken();
    if (token.startsWith("c")) {
        word.set("C_Count");
        output.collect(word, one);
    }
}
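
With the counts from the question's sample output (could 2, can 3, cat 5), this mapper emits ten ("C_Count", 1) pairs, and the unchanged reducer collapses them into a single line:

C_Count 10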

Here is an example using the new API.

Driver class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = new Job(conf, "wordcount");
        FileSystem fs = FileSystem.get(conf);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // delete the output directory if it already exists, so the job can be rerun
        if (fs.exists(new Path(args[1])))
            fs.delete(new Path(args[1]), true);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJarByClass(WordCount.class);     
        job.waitForCompletion(true);
    }

}
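
One optional tweak (my suggestion, not part of the original answer): register the same Reduce class as a combiner, so the ("C_Count", 1) pairs are pre-summed on each map task before the shuffle. That is safe here because the reduce step is a plain associative sum. Add this to the driver before waitForCompletion:

job.setCombinerClass(Reduce.class); // optional: pre-aggregate the 1s map-side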

Mapper class

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer itr = new StringTokenizer(line);
        while (itr.hasMoreTokens()) {
            String token = itr.nextToken();
            if(token.startsWith("c")){
                word.set("C_Count");
                context.write(word, one);
            }

        }
    }
}

Reducer class

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
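
As an aside (my addition, not something the original answer covers): when all you need is one global number, you can skip the reduce phase entirely and use Hadoop's built-in counters. A minimal sketch against the same new API; the class name CounterMap and the group/counter names are placeholders:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CounterMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            if (itr.nextToken().startsWith("c")) {
                // job-wide counter, aggregated by the framework; no reducer needed
                context.getCounter("WordCount", "C_TOTAL").increment(1);
            }
        }
    }
}

In the driver, call job.setNumReduceTasks(0) to make the job map-only, and after job.waitForCompletion(true) read the total with job.getCounters().findCounter("WordCount", "C_TOTAL").getValue().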

Simpler code for mapper:

public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> op,
                Reporter r) throws IOException {
    String s = value.toString();
    // split on runs of non-word characters, then count tokens starting with "c"
    for (String w : s.split("\\W+")) {
        if (w.length() > 0 && w.startsWith("c")) {
            op.collect(new Text("C-Count"), new IntWritable(1));
        }
    }
}
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    public void map(LongWritable ikey, Text ivalue, Context context)
            throws IOException, InterruptedException {
        String line = ivalue.toString();
        String[] values = line.split(" ");
        IntWritable val = new IntWritable(1);
        for (String i : values) {
            // guard against empty tokens before reading the first character
            if (!i.isEmpty() && i.charAt(0) == 'c') {
                context.write(new Text("c"), val);
            }
        }
    }
}


import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
