在Hadoop 2.2.0中打开缓存文件

[英]Opening cache files in Hadoop 2.2.0

After adding a cache file to a job with job.addCacheFile() and retrieving its URI in my mapper with context.getCacheFiles(), how can I open the cache file? 使用job.addCacheFile()将缓存文件添加到作业，并在映射器中通过context.getCacheFiles()取得其URI之后，如何打开该缓存文件？ I've tried using: 我尝试过使用：

BufferedReader reader = new BufferedReader(new FileReader(filename)); (marked with a comment in the code below) (在下面的代码中用注释标出)

where filename is the toString() of the URI, but I get an IOException saying the file doesn't exist. 其中filename是该URI的toString()结果，但是我得到一个IOException，说该文件不存在。 Can anyone help me out? 谁能帮我吗？

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.chain.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.lib.reduce.*;

public class UFOLocation2
    public static class MapClass extends Mapper<LongWritable, Text, Text, LongWritable>
        private final static LongWritable one = new LongWritable(1);
        private static Pattern locationPattern = Pattern.compile("[a-zA-Z]{2}[^a-zA-Z]*$");

    private Map<String, String> stateNames;

    public void setup(Context context)
            URI[] cacheFiles = context.getCacheFiles();
        catch (IOException ioe)
            System.err.println("Error reading state file.");

    public void map(LongWritable key, Text value, Context context) 
                    throws IOException, InterruptedException
        String line = value.toString();
        String[] fields = line.split("\t");
        String location = fields[2].trim();

        if (location.length() >= 2)
            Matcher matcher = locationPattern.matcher(location);
            if (matcher.find())
                int start = matcher.start();
                String state = location.substring(start, start + 2);
                context.write(new Text(lookupState(state.toUpperCase())), one);

    private void setupStateMap(String filename) throws IOException
        Map<String, String> states = new HashMap<String, String>();
        // the following line causes an IOException
        BufferedReader reader = new BufferedReader(new FileReader(filename));
        String line = reader.readLine();
        while (line != null)
            String[] split = line.split("\t");
            states.put(split[0], split[1]);
            line = reader.readLine();
        stateNames = states;

    private String lookupState(String state)
        String fullName = stateNames.get(state);
        return fullName == null ? "Other" : fullName;

public static void main(String[] args) throws Exception
    Configuration config = new Configuration();
    Job job = Job.getInstance(config, "UFO Location 2");

    job.addCacheFile(new URI("/user/kevin/data/states.txt"));


    Configuration mapconf1 = new Configuration(false);
    ChainMapper.addMapper(job, UFORecordValidationMapper.class, LongWritable.class, 
                Text.class, LongWritable.class, Text.class, mapconf1);

    Configuration mapconf2 = new Configuration(false);
    ChainMapper.addMapper(job, MapClass.class, LongWritable.class, 
                Text.class, Text.class, LongWritable.class, mapconf2);


    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);               

Is this a code issue or a configuration issue? 这是代码问题还是配置问题？ I'm running on a supposedly pseudo-distributed cluster. 我正在一个据称是伪分布式的集群上运行。

You can use something like this: 你可以使用类似这样的代码：

    // Reads the cache file through the Hadoop FileSystem API instead of java.io.
    // `uri` and `fileSystem` come from the surrounding code, which this snippet
    // does not show (presumably context.getCacheFiles() and FileSystem.get(conf)).
    Path path = new Path(uri[0].getPath());
    if (fileSystem.exists(path)) {
        FSDataInputStream dataInputStream = fileSystem.open(path);
        try {
            byte[] data = new byte[1024];
            int bytesRead;
            // Only the first bytesRead bytes of the buffer are valid on each pass.
            while ((bytesRead = dataInputStream.read(data)) > 0) {
                //do your stuff here
            }
        } finally {
            // Release the stream even if processing throws.
            dataInputStream.close();
        }
    }


