[英]Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe
import os
import sys
os.chdir("/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/bin")
os.curdir
if 'SPARK_HOME' not in os.environ:
os.environ['SPARK_HOME'] = '/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7'
SPARK_HOME = os.environ['SPARK_HOME']
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.9-src.zip"))
from pyspark import SparkContext
from pyspark import SparkConf
# Optionally configure Spark Settings
conf=SparkConf()
conf.set("spark.executor.memory", "1g")
conf.set("spark.cores.max", "2")
conf.setAppName("V2 Maestros")
## Initialize SparkContext. Run only once. Otherwise you get multiple
#Context Error.
sc = SparkContext('local', conf=conf)
#Test to make sure everything works.
lines=sc.textFile("auto-data.csv")
lines.count()
This is the error that occurred.这是发生的错误。 It was simple program calculating the number of entering of the file but this error came up.
这是一个计算文件输入次数的简单程序,但出现了这个错误。 I have kept the file in both the locations mention in the code even though the result is the same.
即使结果相同,我也将文件保存在代码中提到的两个位置。
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-5c9242495358> in <module>()
1 lines = sc.textFile("auto-save.csv")
----> 2 lines.count()
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in count(self)
1006 3
1007 """
-> 1008 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1009
1010 def stats(self):
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in sum(self)
997 6.0
998 """
--> 999 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1000
1001 def count(self):
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in fold(self, zeroValue, op)
871 # zeroValue provided to each partition is unique from the one provided
872 # to the final reduce call
--> 873 vals = self.mapPartitions(func).collect()
874 return reduce(op, vals, zeroValue)
875
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in collect(self)
774 """
775 with SCCallSiteSync(self.context) as css:
--> 776 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
777 return list(_load_from_socket(port, self._jrdd_deserializer))
778
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/utils.pyc in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
310 raise Py4JJavaError(
311 "An error occurred while calling {0}{1}{2}.\n".
--> 312 format(target_id, ".", name), value)
313 else:
314 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:200)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:53)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
I faced same error and I solved it.我遇到了同样的错误,我解决了它。 If we configure Spark context with more cores as workers than your system supports.
如果我们将 Spark 上下文配置为具有比系统支持更多的内核作为工作线程。 Like I have 3 core system but in my code when I mentioned below code it won't work because I don't have 4th core.
就像我有 3 个核心系统,但是在我的代码中,当我在下面提到代码时,它不起作用,因为我没有第 4 个核心。
Unsupported Spark Context Configuration code for which I got Py4JJavaerror:我得到 Py4JJavaerror 的不受支持的 Spark 上下文配置代码:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("Collinear Points").setMaster("local[4]") #Initialize spark context using 4 local cores as workers
sc = SparkContext(conf=conf)
from pyspark.rdd import RDD
Supported SparkContext Configuration code for all types of systems because in below we are not initializing cores explicitly as workers.支持所有类型系统的 SparkContext 配置代码,因为在下面我们没有将核心明确初始化为工作人员。
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("Collinear Points")
sc = SparkContext('local',conf=conf)
from pyspark.rdd import RDD
You should save your output as您应该将输出保存为
lines=sc.textFile("hdfs:///tmp/auto-data.csv")
or just或者只是
lines=sc.textFile("/tmp/auto-data.csv")
This command would write your output to hdfs此命令会将您的输出写入 hdfs
The exception is self-explanatory.例外是不言自明的。 Try to give absolute path of
auto-save.csv
to lines=sc.textFile("auto-data.csv")
or move the auto-save.csv
to /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/
尝试将
auto-save.csv
绝对路径auto-save.csv
lines=sc.textFile("auto-data.csv")
或将auto-save.csv
移动到/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/
thonRDD.collectAndServe.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.