[英]Hadoop wordcount - reduce method print 1 file result to a single 1 line
我是Hadoop的新手,我只限于减少1行中的1个文件结果。
[filename v1,v2,v3...,v100]
我试图将其打印为ArrayWritable,结果如下:
a [255]
for [255,24]
to [255,24,10]
等等
但实际上我要打印的是
[filename 473,255, 30,....,v100]
将它们全部分组为1行,并且不要根据键(key)将其分开。
输入的是一篇文章,例如(To Sherlock Holmes she is always THE woman. I have seldom heard him mention her under any other name. In his eyes she eclipses ......)
输出格式应为“[filename v1,v2,v3…,v100]”。
v1,v2...,v100
表示向量中的值。 我一直坚持以上述格式打印结果。 我只需要打印一次数组,但是每次重复打印一次
public class WordCount implements Tool{
private final static String[] top100Word = { "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
"it", "for", "not", "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from", "they",
"we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", "there", "their", "what", "so",
"up", "out", "if", "about", "who", "get", "which", "go", "me", "when", "make", "can", "like", "time", "no",
"just", "him", "know", "take", "people", "into", "year", "your", "good", "some", "could", "them", "see",
"other", "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", "after",
"use", "two", "how", "our", "work", "first", "well", "way", "even", "new", "want", "because", "any",
"these", "give", "day", "most", "us" };
private static Vector<Integer> vec = new Vector<Integer>(100);
// private static ArrayList<IntWritable> list = new ArrayList<IntWritable>();
private static StringBuilder str = new StringBuilder();
private static String[][] myArray = new String[10][100];
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
vec.setSize(100);
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, Text> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum=0;
for (IntWritable val : values) {
sum += val.get();
}
String finalVal = "";
StringBuilder sb = new StringBuilder();
sb.append("");
for(int i=0;i<top100Word.length;i++){
Text topValue = new Text(top100Word[i]);
// vec.add(i,sum);
if(key.equals(topValue)){
System.out.println("hi");
System.out.println("key"+key);
System.out.println(i);
vec.add(i,sum);
//result.set(sum); //sum= count number
// context.write(key, result);
sb.append(sum);
String strI = sb.toString();
myArray[0][i] = strI;
//list.add(i,new IntWritable(sum));
if(str.length() > 0)
{
str.append(",");
}
str.append(myArray[0][i].toString());
context.write(key, new Text(str.toString()));
// context.write(key, new Text(myArray[0][i]));
break;
}
}
}
}
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] files=new GenericOptionsParser(conf,args).getRemainingArgs();
Path dst_path = new Path(files[0]);
Path in_path1 = new Path(files[1]);
Path in_path2 = new Path(files[2]);
Path in_path3 = new Path(files[3]);
FileSystem hdfs = dst_path.getFileSystem(conf);
if (hdfs.exists(dst_path)){
hdfs.delete(dst_path, true);
};
//Job job = Job.getInstance(conf, "word count");
Job job = new Job(conf,"word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// map output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// reducer output types
job.setOutputValueClass(Text.class);
job.setOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, in_path1, TextInputFormat.class, Map.class);
// MultipleInputs.addInputPath(job, in_path2, TextInputFormat.class, Map.class);
// MultipleInputs.addInputPath(job, in_path3, TextInputFormat.class, Map.class);
job.setNumReduceTasks(1);
// FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, dst_path);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int ecode = ToolRunner.run(new WordCount(), args);
for(int i=0 ;i<vec.size();i++){
System.out.println(vec.get(i)+",");
// context.write(lastkey,new MyArrayWritable(IntWritable.class, list.toArray(new IntWritable[vec.get(i)])));
}
for(int i=0;i<myArray.length;i++){
if(str.length() > 0)
{
str.append(",");
}
str.append(myArray[0][i].toString());
}
System.out.println(str);
System.exit(ecode);
}
@Override
public Configuration getConf() {
// TODO Auto-generated method stub
return null;
}
@Override
public void setConf(Configuration arg0) {
// TODO Auto-generated method stub
}
}
如果只希望文件名是单个键,则不能仅使用WordCount作为示例代码...
实际考虑一下您从映射器到减速器的输出是什么。 例如,不清楚从何处获取文件名。 理想情况下,如果可能的话,它来自映射器上下文。
映射器生成的输出当前分为多行,与它们来自哪个文件无关
(To, 1)
(Sherlock, 1)
(Holmes, 1)
(she, 1)
(she, 1)
等等。 除非您明确地将令牌小写,否则大写很重要。
我对您的建议是宁愿输出一些有关文件名的信息,以及对映射器中一行的单词求和。 文件的一行将输出一条这样的Reducer记录
(filename, To=1; Sherlock=1; Holmes=1; she=2)
文件的下一行将产生类似的记录,文件名键相同,而值文本不同。
因此,reducer将看到的是带有可迭代文本条目列表的单个文本键
(filename, ["To=1; Sherlock=1; Holmes=1; she=2", "..."])
然后,您要做的就是提取值文本,并对每个word=count
配对进行汇总。
最后,请在映射器代码中执行“ top word”检查,而不是在reducer中。 否则,您将浪费您不关心的映射器中的资源。
这是映射器的起点
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
private static final HashMap<String, Integer> counter = new HashMap<>();
private Text output = new Text();
private String mapToString(Map<String, Object> map) {
StringBuilder sb = new StringBuilder();
Iterator<Entry<String, Object>> iter = map.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, Object> entry = iter.next();
sb.append(entry.getKey());
sb.append('=');
sb.append(entry.getValue().toString());
if (iter.hasNext()) {
sb.append(';').append(' ');
}
}
return sb.toString();
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// TODO: Get filename
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if (!counter.containsKey(token)) {
counter.put(token, 1);
} else {
counter.put(token, (Integer) counter.get(token) + 1);
}
}
output.set(mapToString(counter));
context.write(new Text("filename"), output);
}
}
我将把 reducer 的实现留给您自己完成。请记住,此代码要求您的 reducer 声明为 extends Reducer&lt;Text, Text, Text, Text&gt;
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.