[英]pyspark structured streaming kafka - py4j.protocol.Py4JJavaError: An error occurred while calling o41.save
[英]Python Page Rank Streaming Application using Hadoop, py4j.protocol.Py4JJavaError: An error occurred while calling o27.partitions
我正在嘗試將頁面排名算法從簡單的 python 代碼傳遞到在 spark 中使用 python 的流式應用程序,在這種情況下,根據特定時間(10)秒從另一個 python 腳本中獲取輸入本來應該根據時間來拿它們並分析它們,當我運行以下代碼時出現錯誤,我不知道是什么原因導致錯誤,我試圖獲取.csv文件,
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
def main(input_folder_location):
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 10) # Streaming will execute in each 3 seconds
ssc.checkpoint(input_folder_location) # 'mean directory name, Directory to be checked
links = spark.sparkContext.textFile(input_folder_location). \
map(lambda line: line.split(',')). \
map(lambda pages: (pages[0], pages[1])). \
distinct(). \
groupByKey(). \
map(lambda x: (x[0], list(x[1])))
ranks = links.map(lambda element: (element[0], 1.0))
# iterations = int(sys.argv[3])
iterations = 4
for x in range(iterations + 1):
contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
print("\n")
print("------- Iter: " + str(x) + " --------")
ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda x: (x[0], x[1] * 0.85 + 0.15))
for rank in ranks.collect():
print(rank)
print("\n")
print("------- Final Results --------")
for rank in ranks.collect():
print(rank)
ssc.start()
ssc.awaitTermination()
def computeContribs(neighbors, rank):
for neighbor in neighbors:
yield (neighbor, rank / len(neighbors))
if __name__ == "__main__":
if len(sys.argv) < 2:
sys.stderr.write(
"Error: Usage: StreamingApp.py <input-file-directory>")
sys.exit()
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
main(sys.argv[1])
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
def computeContribs(neighbors, rank):
for neighbor in neighbors:
yield (neighbor, rank / len(neighbors))
def main(input_folder_location):
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 3) #Streaming will execute in each 3 seconds
lines = ssc.textFileStream(input_folder_location) #'log/ mean directory name
print(" hello from app")
counts = lines.map(lambda line: line.split(",")) \
.map(lambda pages:(pages[0],pages[1])) \
.transform(lambda rdd: rdd.distinct()) \
.groupByKey() \
.map(lambda x: (x[0],list(x[1])))
ranks = counts.map(lambda element:(element[0],1.0))
iterations = 5
for x in range(1):
contribs = counts.join(ranks).flatMap(lambda row: computeContribs(row[1][0],row[1][1]))
print("\n")
print(" iter --------------",x)
ranks = contribs.reduceByKey(lambda v1,v2:v1+v2)
print("\n")
#counts.pprint()
ranks.pprint()
print(" finishing the task")
ssc.start()
ssc.awaitTermination()
if __name__ == "__main__":
if len(sys.argv) < 2:
sys.stderr.write(
"Error: Usage: StreamingApp.py <input-file-directory>")
sys.exit()
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
main(sys.argv[1])
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.