I am trying to use Apache Spark to process the separate CSV files in a directory in parallel. Specifically, I want each slave node to sum all the numbers in the first column of each CSV file and send the result back. The following is my code:
import os, sys, inspect, csv

### Current directory path.
curr_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]

### Set up the environment variables
spark_home_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../../spark-1.4.0")))
python_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
os.environ["SPARK_HOME"] = spark_home_dir
os.environ["PYTHONPATH"] = python_dir

### Set up the pyspark directory path
pyspark_dir = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "../python")))
sys.path.append(pyspark_dir)

### Import pyspark
from pyspark import SparkConf, SparkContext

### Specify the data file directory, and load the data files
data_path = os.path.realpath(os.path.abspath(os.path.join(curr_dir, "./test_dir")))

### myfunc adds all the numbers in the first column.
def myfunc(s):
    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
    cr = csv.reader(open(s_new, "rb"))
    total = 0
    for row in cr:
        total += int(row[0])
    return total

def main():
    ### Initialize the SparkConf and SparkContext
    conf = SparkConf().setAppName("ruofan").setMaster("local")
    sc = SparkContext(conf=conf)
    datafile = sc.wholeTextFiles(data_path)

    ### Send the application to each of the slave nodes
    temp = datafile.foreach(myfunc)

    ### Collect the result and print it out.
    for x in temp.sample(False, 1).collect():
        print x

if __name__ == "__main__":
    main()
But when I run the code, it shows the following error:
AttributeError: 'tuple' object has no attribute 'startswith'
And the stack trace is as follows:
ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/07/14 16:52:15 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
15/07/14 16:52:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/07/14 16:52:15 INFO TaskSchedulerImpl: Cancelling stage 0
15/07/14 16:52:15 INFO DAGScheduler: ResultStage 0 (foreach at sum.py:40) failed in 0.408 s
15/07/14 16:52:15 INFO DAGScheduler: Job 0 failed: foreach at sum.py:40, took 0.458805 s
Traceback (most recent call last):
File "sum.py", line 47, in <module>
main()
File "sum.py", line 40, in main
temp = datafile.foreach(myfunc)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 721, in foreach
self.mapPartitions(processPartition).count() # Force evaluation
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 972, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 963, in sum
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 771, in reduce
vals = self.mapPartitions(func).collect()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 745, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/java_gateway.py", line 538, in __call__
self.target_id, self.name)
File "/usr/local/lib/python2.7/dist-packages/py4j-0.8.2.1-py2.7.egg/py4j/protocol.py", line 300, in get_return_value
format(target_id, '.', name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 2318, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 304, in func
return f(iterator)
File "/home/ying/AWS_Tutorial/spark-1.4.0/python/pyspark/rdd.py", line 719, in processPartition
f(x)
File "sum.py", line 24, in myfunc
s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))
File "/usr/lib/python2.7/posixpath.py", line 75, in join
if b.startswith('/'):
AttributeError: 'tuple' object has no attribute 'startswith'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
I don't know why my code does not work. I would really appreciate it if anyone could help me solve the problem. Thanks!
The stack trace makes it pretty evident that the error is coming from this line in myfunc:

    s_new = os.path.realpath(os.path.abspath(os.path.join(data_path, s)))

s is expected to be a string (a file path), but you are passing it a tuple from

    temp = datafile.foreach(myfunc)

where datafile comes from wholeTextFiles, which produces an RDD[(String, String)]. The first item in each tuple is the file path and the second is the file contents. So, by the look of the code, you need to pass in only the first part of the tuple (the path):

    datafile.foreach(lambda (path, content): myfunc(path))
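
Note also that foreach is an action that returns None, so the temp variable in the question cannot be collected afterwards; map is the transformation that yields a new RDD. Since wholeTextFiles already ships each file's contents with the RDD, here is a minimal sketch (assuming each CSV is just plain comma-separated numeric rows, and reusing the datafile RDD from the question's main()) that sums the first column directly from the contents instead of reopening the files:

    def sum_first_column(content):
        ### Sum the first column of the CSV text that wholeTextFiles loaded.
        total = 0
        for line in content.splitlines():
            if line.strip():
                total += int(line.split(",")[0])
        return total

    ### map (unlike foreach) returns a new RDD, so the per-file sums can be collected.
    sums = datafile.map(lambda (path, content): (path, sum_first_column(content)))
    for path, total in sums.collect():
        print path, total

Because the file contents travel with the RDD, this also works when the slave nodes cannot see the files on the driver's local filesystem.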