
Using Apache Spark to process separate CSV files in parallel

I am trying to use Apache Spark to process the separate CSV files in a directory in parallel. Specifically, I want each slave node to sum all the numbers in the first column of each CSV file and send back the result. The following is my code:

import os, sys, inspect, csv

### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]

### Setup the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir

### Setup pyspark directory path
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
sys.path.append(pyspark_dir)

### Import the pyspark
from pyspark import SparkConf, SparkContext

### Specify the data file directory, and load the data files
data_path = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "./test_dir")))

### myfunc sums all numbers in the first column of the given csv file.
def myfunc(s):
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
    cr = csv.reader(open(s_new,"rb"))

    total = 0
    for row in cr:  
        total += int(row[0])
    return total

def main():
    ### Initialize the SparkConf and SparkContext
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf = conf)
    datafile = sc.wholeTextFiles(data_path)

    ### Send the work to each of the slave nodes
    temp = datafile.foreach(myfunc)

    ### Collect the result and print it out.
    for x in temp.sample(False, 1).collect(): 
            print x


if __name__ == "__main__":
    main()

But when I run the code, it shows the error below:

AttributeError: 'tuple' object has no attribute 'startswith'

And the stack trace is as follows:

ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

15/07/14 16:52:15 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/14 16:52:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
15/07/14 16:52:15 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/14 16:52:15 INFO DAGScheduler: ResultStage 0 (foreach at sum.py:40) failed in 0.408 s
15/07/14 16:52:15 INFO DAGScheduler: Job 0 failed: foreach at sum.py:40, took 0.458805 s
Traceback (most recent call last):
  File "sum.py", line 47, in <module>
    main()
  File "sum.py", line 40, in main
    temp = datafile.foreach(myfunc)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
    self.mapPartitions(processPartition).count()  # Force evaluation
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
    return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
    vals = self.mapPartitions(func).collect()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
    port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
    self.target_id, self.name)
  File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
    format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
    return f(iterator)
  File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
    f(x)
  File "sum.py", line 24, in myfunc
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
  File "/usr/lib/python2.7/posixpath.py", line 75, in join
    if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'

    at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
    at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

I don't know why my code does not work. I would really appreciate it if anyone could help me solve this problem. Thanks!

The stack trace makes it pretty evident that the error comes from this line in myfunc:

s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))

myfunc expects s to be a string (a file path), but each element of the RDD is a tuple, because of

temp = datafile.foreach(myfunc)

where datafile comes from wholeTextFiles, which produces an RDD[(String, String)]. The first item in each tuple is the file path and the second is the file contents. So, by the look of your code, you need to pass only the first part of the tuple (the path) into myfunc:

datafile.foreach(lambda (path, content): myfunc(path))
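
Since wholeTextFiles already ships the file contents to the workers, a simpler variant is to parse the content string directly instead of re-opening the file inside myfunc, and to use map rather than foreach so you actually get an RDD of results back to collect. This is only a minimal sketch, assuming every row's first field is an integer; sum_first_column is an illustrative helper name, not something from your code:

import csv
from StringIO import StringIO  # Python 2, matching the environment in the traceback

def sum_first_column(path_and_content):
    ### wholeTextFiles yields (path, content) pairs; parse the content string directly.
    path, content = path_and_content
    reader = csv.reader(StringIO(content))
    total = sum(int(row[0]) for row in reader if row)
    return (path, total)

### map (unlike foreach) returns an RDD that can be collected on the driver.
results = datafile.map(sum_first_column).collect()
for path, total in results:
    print path, total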
