I am doing a project that involves using HDFS for storage and Apache Spark for computation. I have a directory in HDFS which has several text files in it at the same depth. I want to process all these files using Spark and store their corresponding results back to HDFS, with one output file for each input file.
For example - suppose I have a directory with 1000 text files in it at the same depth. I am reading all these files using wildcards:
sc.wholeTextFiles("hdfs://localhost:9000/home/akshat/files/*.txt")
Then I process them using Spark and get a corresponding RDD and save that by using
result.saveAsTextFile("hdfs://localhost:9000/home/akshat/final")
But it gives me the result of all the input files in one single file and I want to get each file, process them individually and store the output of each of them individually.
What should be my next approach to achieve this ?
Thanks in advance!
You can do this by using wholeTextFiles(). Note: the approach below processes files one by one.
// wholeTextFiles yields (path, content) pairs; we only need the paths here.
val data = sc.wholeTextFiles("hdfs://master:port/vijay/mywordcount/")
val files = data.map { case (path, _) => path }

// Process a single input file: as an example, count lines containing "a".
def doSomething(file: String) = {
  println(file)
  // your logic of processing a single file comes here
  val logData = sc.textFile(file)
  val numAs = logData.filter(_.contains("a")).count()
  println("Lines with a: %s".format(numAs))
  // save rdd of single file processed data to hdfs comes here
}

// Pull the file list to the driver and process each file sequentially.
files.collect.foreach(doSomething)
where:
Update: multiple output files
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/* hadoop */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
/* java */
import java.io.Serializable;
import org.apache.log4j.Logger
import org.apache.log4j.Level
/* Custom TextOutput Format */
/* Custom TextOutputFormat: routes each (key, value) record to an output file
 * named after its key, so every input file's results land in their own file. */
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  // Suppress the key in the written record — only the value text is emitted.
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()

  // Name each part file "<inputFilename>-part-NNNNN" inside the output dir,
  // e.g. hdfs://Output_dir/inputFilename-part-00000.
  // Use "/" instead of "-" to get hdfs://Output_dir/inputFilename/part-NNNNN
  // (one subdirectory of part files per input file).
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.asInstanceOf[String] + "-" + name
}
/* Spark Context */
/* Application-wide Spark context, shared as a singleton. */
object Spark {
  private val conf = new SparkConf().setAppName("test").setMaster("local[*]")
  val sc = new SparkContext(conf)
}
/* WordCount Processing */
/* Per-file WordCount processing. */
object Process extends Serializable {
  /** Word-counts a single file and tags each result with the file's simple
    * name, returning an RDD of (filename, "word\tcount") pairs — the shape
    * expected by RDDMultipleTextOutputFormat downstream. */
  def apply(filename: String): org.apache.spark.rdd.RDD[(String, String)] = {
    println("i am called.....")
    val simplePath = filename.split('/').last
    val counts = Spark.sc
      .textFile(filename)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _) // (word, count)
    counts.map { case (word, count) => (simplePath, word + "\t" + count) }
  }
}
object SimpleApp {
  /** Entry point: word-counts every file under INPUT_PATH and writes one
    * output file per input file via RDDMultipleTextOutputFormat. */
  def main(args: Array[String]) {
    //Logger.getLogger("org").setLevel(Level.OFF)
    //Logger.getLogger("akka").setLevel(Level.OFF)

    // Input and output HDFS paths.
    val INPUT_PATH = "hdfs://master:8020/vijay/mywordcount/"
    val OUTPUT_PATH = "hdfs://master:8020/vijay/mywordcount/output/"

    val context = Spark.sc
    // wholeTextFiles yields (filename, content) pairs; we only need the names.
    val data = context.wholeTextFiles(INPUT_PATH)
    val files = data.map { case (filename, _) => filename }

    // Fold the per-file result RDDs into a single union immutably (no var).
    // NOTE(review): unioning many RDDs one at a time builds a long lineage;
    // for a very large file count, collect the RDDs into a Seq and flatten
    // with a single context.union(seq) instead.
    val empty: org.apache.spark.rdd.RDD[(String, String)] = context.emptyRDD
    val output = files.collect.foldLeft(empty) { (acc, filename) =>
      acc.union(Process(filename))
    }

    //output.saveAsTextFile(OUTPUT_PATH) // would save output as (filename,word\tcount)
    // Custom output format: one file (or directory) per input-filename key.
    output.saveAsHadoopFile(OUTPUT_PATH, classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat])

    // Close the context.
    context.stop()
  }
}
environment:
sample output:
[ramisetty@node-1 stack]$ hadoop fs -ls /vijay/mywordcount/output
Found 5 items
-rw-r--r-- 3 ramisetty supergroup 0 2015-06-09 03:49 /vijay/mywordcount/output/_SUCCESS
-rw-r--r-- 3 ramisetty supergroup 40 2015-06-09 03:49 /vijay/mywordcount/output/file1.txt-part-00000
-rw-r--r-- 3 ramisetty supergroup 8 2015-06-09 03:49 /vijay/mywordcount/output/file1.txt-part-00001
-rw-r--r-- 3 ramisetty supergroup 44 2015-06-09 03:49 /vijay/mywordcount/output/file2.txt-part-00002
-rw-r--r-- 3 ramisetty supergroup 8 2015-06-09 03:49 /vijay/mywordcount/output/file2.txt-part-00003
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.