![](/img/trans.png)
[英]Custom Hadoop Configuration for Spark from Python (PySpark)?
[英]Changing aws credentials in hadoop configuration for pyspark during runtime after initialization of spark context
我在 Stack Overflow 上四處尋找相關問題的解決方案,但似乎這個問題相當獨特。 就上下文而言,由於公司流程的要求,我需要每小時刷新一次 AWS 安全憑證,而我一直無法把刷新后的新安全憑證添加到 Spark 中。 第一個小時一切正常(我可以從 s3 等訪問和讀取表),但在第一個小時結束並刷新憑據后,我無法成功更改 Spark 所使用的 aws 憑據。
刷新我的 aws 憑據后,這里是我用來更新 spark 以使其使用新的 aws 憑據的代碼:
# SparkContext used by every configuration call below. `spark` is defined
# outside this snippet -- presumably the active SparkSession; confirm.
sc = spark.sparkContext
def getAWSKeys(profile):
    """Read one profile's credentials from the ``.aws/credentials`` file.

    Args:
        profile: Name of the section to look up in the credentials file.

    Returns:
        A dict with the keys ``aws_access_key_id``,
        ``aws_secret_access_key`` and ``aws_session_token`` when the file
        contains a section named ``profile``; an empty dict otherwise
        (including when the file does not exist or cannot be found).

    Raises:
        Whatever ``ConfigParser.get`` raises when the section exists but
        one of the three options is missing (``NoOptionError`` for the
        standard-library parser).
    """
    # $HOME is honoured exactly as before; when it is unset or empty we
    # fall back to expanduser() instead of passing None to os.path.join,
    # which would raise TypeError.
    home = os.getenv("HOME") or os.path.expanduser("~")

    parser = ConfigParser.ConfigParser()
    # read() silently skips files that do not exist, leaving no sections.
    parser.read(os.path.join(home, ".aws", "credentials"))

    awsCreds = {}
    if parser.has_section(profile):
        for option in (
            "aws_access_key_id",
            "aws_secret_access_key",
            "aws_session_token",
        ):
            awsCreds[option] = parser.get(profile, option)
    return awsCreds
# Load the freshly refreshed credentials for the selected profile.
awsKeys = getAWSKeys(profile)
access_key_id = awsKeys["aws_access_key_id"]
secret_access_key = awsKeys["aws_secret_access_key"]
session_token = awsKeys["aws_session_token"]

# Every Hadoop property written below, in application order. The same
# seven settings are repeated for the s3, s3a and s3n schemes, covering
# both spellings of the access-key / secret-key property names.
hadoop_settings = [
    ("fs.s3.awsAccessKeyId", access_key_id),
    ("fs.s3.awsSecretAccessKey", secret_access_key),
    ("fs.s3.session.token", session_token),
    ("fs.s3.enableServerSideEncryption", "true"),
    ("fs.s3.access.key", access_key_id),
    ("fs.s3.secret.key", secret_access_key),
    ("fs.s3.endpoint", "s3.us-east-1.amazonaws.com"),
    ("fs.s3a.awsAccessKeyId", access_key_id),
    ("fs.s3a.awsSecretAccessKey", secret_access_key),
    ("fs.s3a.session.token", session_token),
    ("fs.s3a.enableServerSideEncryption", "true"),
    ("fs.s3a.access.key", access_key_id),
    ("fs.s3a.secret.key", secret_access_key),
    ("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com"),
    ("fs.s3n.awsAccessKeyId", access_key_id),
    ("fs.s3n.awsSecretAccessKey", secret_access_key),
    ("fs.s3n.session.token", session_token),
    ("fs.s3n.enableServerSideEncryption", "true"),
    ("fs.s3n.access.key", access_key_id),
    ("fs.s3n.secret.key", secret_access_key),
    ("fs.s3n.endpoint", "s3.us-east-1.amazonaws.com"),
]

hadoop_conf = sc._jsc.hadoopConfiguration()
for setting_name, setting_value in hadoop_settings:
    hadoop_conf.set(setting_name, setting_value)

# Switch on the enableV4 system property for each scheme.
for property_name in (
    "com.amazonaws.services.s3.enableV4",
    "com.amazonaws.services.s3n.enableV4",
    "com.amazonaws.services.s3a.enableV4",
):
    sc.setSystemProperty(property_name, "true")

# Left disabled by the author: explicit credentials-provider override.
# sc._jsc.hadoopConfiguration().set("fs.s3.aws.credentials.provider",
# "org.apache.hadoop.fs.s3.TemporaryAWSCredentialsProvider")

# Mirror the credentials into this (driver-side Python) process's environment.
for env_name, env_value in (
    ("AWS_ACCESS_KEY_ID", access_key_id),
    ("AWS_SECRET_ACCESS_KEY", secret_access_key),
    ("AWS_SESSION_TOKEN", session_token),
):
    os.environ[env_name] = env_value
我試圖在我的方法中詳盡無遺,但遺憾的是沒有任何效果。 我得到的錯誤是:
Py4JJavaError Traceback (most recent call last)
<ipython-input-57-674174eca978> in <module>()
3 table = (
4 spark.read.option("delimiter", "|")
----> 5 .csv(f"s3n://{s3_path}/{file1}", header = True, inferSchema=True)
6 .select("col1", "col2", "col3", "col4")
7 )
/usr/lib/spark/python/pyspark/sql/readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine)
408 if isinstance(path, basestring):
409 path = [path]
--> 410 return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
411
412 @since(1.5)
/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o12923.csv.
: com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 9A4F6DDEA3BD8AA6), S3 Extended Request ID: xg9ZiPjfV3h4rGgs5emsUiWl8xQdv0OMhK/91qdAs/iIvapWgIlWh9m1qLTGj3ODFM9MtEnuueg=
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1588)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1258)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1030)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:742)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:716)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4169)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4116)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1237)
at com.amazon.ws.emr.hadoop.fs.s3.lite.call.GetObjectMetadataCall.perform(GetObjectMetadataCall.java:24)
at com.amazon.ws.emr.hadoop.fs.s3.lite.call.GetObjectMetadataCall.perform(GetObjectMetadataCall.java:10)
at com.amazon.ws.emr.hadoop.fs.s3.lite.executor.GlobalS3Executor.execute(GlobalS3Executor.java:82)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.invoke(AmazonS3LiteClient.java:176)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.getObjectMetadata(AmazonS3LiteClient.java:94)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AbstractAmazonS3Lite.getObjectMetadata(AbstractAmazonS3Lite.java:39)
at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.retrieveMetadata(Jets3tNativeFileSystemStore.java:211)
at sun.reflect.GeneratedMethodAccessor42.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:191)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
at com.sun.proxy.$Proxy36.retrieveMetadata(Unknown Source)
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:768)
at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1430)
at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.exists(EmrFileSystem.java:311)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:359)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)
at sun.reflect.GeneratedMethodAccessor118.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
重申一下,第一個小時一切正常,但是當我刷新 aws 憑據時出現 400 Bad Request 錯誤。 我試圖將這些新的 aws 憑證添加到 spark,但我嘗試過的任何方法都沒有奏效。
我看不到這樣做的簡單方法,因為這些憑據會綁定到文件系統實例,然后被凍結。
如果要嘗試這樣做,我會編寫自己的 AWSCredentialsProvider 實現,由該實現為 AWS 調用提供憑據。 默認的提供程序鏈類似於:spark config、env vars、對 EC2 元數據服務的 GET 請求。 您可以在其中添加一個新的提供程序,讓它以某種方式獲取新值。 不過,您還需要想出一種辦法,將新的會話憑據傳播到群集中的每個主機……這很費功夫。
另一件需要知道的事是,AWS 假定角色(assumed role)的最長有效期已從 1 小時增加到 12 小時,因此,如果您可以讓 IT 團隊將分配給您的角色的有效期增加到 12 小時,那么您也許可以撐過一整天。
請先嘗試。
ps:CSV 的“ inferSchema = true”意思是“僅僅為了確定模式,就先把整個 CSV 文件完整讀取一遍”。 應避免使用。
最近偶然發現了這個; 在深入研究 hadoop-aws java 代碼后,發現文件系統實例(連同其憑據提供程序)會被緩存,可以將以下屬性設為 true:
fs.<scheme>.impl.disable.cache
(在我們的例子中是 scheme=s3a)來禁用這個緩存; 這樣每次都會實例化一個新的憑據提供程序,但至少您將能夠使用新憑據創建一個。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.