AWS Glue: Unable to parse CSV gzip to Parquet
I am trying to parse CSV files into Parquet using the AWS Glue ETL tool, following this tutorial from the Amazon website.
Since I have a bunch of CSV files in an S3 bucket, I store their contents gzip-compressed. I am using the following Lambda function to download the data from a zip archive, extract the CSV files, and save them as gzipped CSVs in my S3 bucket:
import json
import boto3
from botocore.vendored import requests
import zipfile
from gzip import GzipFile
from io import BytesIO

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket_name = "my-bucket"

    file_url = 'http://dados.cvm.gov.br/dados/FI/DOC/CDA/DADOS/'
    file_name = "cda_fi_201801"

    req = requests.get(file_url + file_name + ".zip", stream=True)
    data = req.raw.read()
    zf = zipfile.ZipFile(BytesIO(data))

    for fn in zf.namelist():
        bytes = zf.read(fn).decode("windows-1252")

        print('File:', fn)
        print('has', len(bytes), 'bytes')

        # Choose folder name to put csv file
        parts = fn.split("_")
        folder = "PL"
        if parts[2] == "BLC":
            folder = "BLC_" + parts[3]

        # BytesIO to not save to disk
        gz_body = BytesIO()
        gz = GzipFile(None, 'wb', 9, gz_body)

        # Write csv bytes to gzip body
        gz.write(bytes.encode('utf8'))

        s3.Bucket(bucket_name).put_object(Key=folder + "/" + fn,
                                          ContentType="text/plain",
                                          ContentEncoding='gzip',
                                          Body=gz_body.getvalue())
        gz.close()

    return {
        'statusCode': 200,
    }
Running the AWS Glue crawler on the BLC_1 folder, I got the following table properties:
{
"StorageDescriptor": {
"cols": {
"FieldSchema": [
{
"name": "tp_fundo",
"type": "string",
"comment": ""
},
{
"name": "cnpj_fundo",
"type": "string",
"comment": ""
},
{
"name": "denom_social",
"type": "string",
"comment": ""
},
{
"name": "dt_comptc",
"type": "string",
"comment": ""
},
{
"name": "tp_aplic",
"type": "string",
"comment": ""
},
{
"name": "tp_ativo",
"type": "string",
"comment": ""
},
{
"name": "emissor_ligado",
"type": "string",
"comment": ""
},
{
"name": "tp_negoc",
"type": "string",
"comment": ""
},
{
"name": "qt_venda_negoc",
"type": "double",
"comment": ""
},
{
"name": "vl_venda_negoc",
"type": "double",
"comment": ""
},
{
"name": "qt_aquis_negoc",
"type": "double",
"comment": ""
},
{
"name": "vl_aquis_negoc",
"type": "double",
"comment": ""
},
{
"name": "qt_pos_final",
"type": "double",
"comment": ""
},
{
"name": "vl_merc_pos_final",
"type": "double",
"comment": ""
},
{
"name": "vl_custo_pos_final",
"type": "string",
"comment": ""
},
{
"name": "dt_confid_aplic",
"type": "string",
"comment": ""
},
{
"name": "tp_titpub",
"type": "string",
"comment": ""
},
{
"name": "cd_isin",
"type": "string",
"comment": ""
},
{
"name": "cd_selic",
"type": "bigint",
"comment": ""
},
{
"name": "dt_emissao",
"type": "string",
"comment": ""
},
{
"name": "dt_venc",
"type": "string",
"comment": ""
}
]
},
"location": "s3://my-bucket/BLC_1/",
"inputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed": "true",
"numBuckets": "-1",
"SerDeInfo": {
"name": "",
"serializationLib": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters": {
"field.delim": ";"
}
},
"bucketCols": [],
"sortCols": [],
"parameters": {
"skip.header.line.count": "1",
"sizeKey": "731056",
"objectCount": "1",
"UPDATED_BY_CRAWLER": "blc-1",
"CrawlerSchemaSerializerVersion": "1.0",
"recordCount": "1884",
"averageRecordSize": "258",
"CrawlerSchemaDeserializerVersion": "1.0",
"compressionType": "gzip",
"classification": "csv",
"columnsOrdered": "true",
"areColumnsQuoted": "false",
"delimiter": ";",
"typeOfData": "file"
},
"SkewedInfo": {},
"storedAsSubDirectories": "false"
},
"parameters": {
"skip.header.line.count": "1",
"sizeKey": "731056",
"objectCount": "1",
"UPDATED_BY_CRAWLER": "blc-1",
"CrawlerSchemaSerializerVersion": "1.0",
"recordCount": "1884",
"averageRecordSize": "258",
"CrawlerSchemaDeserializerVersion": "1.0",
"compressionType": "gzip",
"classification": "csv",
"columnsOrdered": "true",
"areColumnsQuoted": "false",
"delimiter": ";",
"typeOfData": "file"
}
}
After that, following the tutorial, I tried to run an ETL job using the following script auto-generated by Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "cvm", table_name = "blc_1", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "cvm", table_name = "blc_1", transformation_ctx = "datasource0")
## @type: ApplyMapping
## @args: [mapping = [("tp_fundo", "string", "tp_fundo", "string"), ("cnpj_fundo", "string", "cnpj_fundo", "string"), ("denom_social", "string", "denom_social", "string"), ("dt_comptc", "string", "dt_comptc", "string"), ("tp_aplic", "string", "tp_aplic", "string"), ("tp_ativo", "string", "tp_ativo", "string"), ("emissor_ligado", "string", "emissor_ligado", "string"), ("tp_negoc", "string", "tp_negoc", "string"), ("qt_venda_negoc", "double", "qt_venda_negoc", "double"), ("vl_venda_negoc", "double", "vl_venda_negoc", "double"), ("qt_aquis_negoc", "double", "qt_aquis_negoc", "double"), ("vl_aquis_negoc", "double", "vl_aquis_negoc", "double"), ("qt_pos_final", "double", "qt_pos_final", "double"), ("vl_merc_pos_final", "double", "vl_merc_pos_final", "double"), ("vl_custo_pos_final", "string", "vl_custo_pos_final", "string"), ("dt_confid_aplic", "string", "dt_confid_aplic", "string"), ("tp_titpub", "string", "tp_titpub", "string"), ("cd_isin", "string", "cd_isin", "string"), ("cd_selic", "long", "cd_selic", "long"), ("dt_emissao", "string", "dt_emissao", "string"), ("dt_venc", "string", "dt_venc", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("tp_fundo", "string", "tp_fundo", "string"), ("cnpj_fundo", "string", "cnpj_fundo", "string"), ("denom_social", "string", "denom_social", "string"), ("dt_comptc", "string", "dt_comptc", "string"), ("tp_aplic", "string", "tp_aplic", "string"), ("tp_ativo", "string", "tp_ativo", "string"), ("emissor_ligado", "string", "emissor_ligado", "string"), ("tp_negoc", "string", "tp_negoc", "string"), ("qt_venda_negoc", "double", "qt_venda_negoc", "double"), ("vl_venda_negoc", "double", "vl_venda_negoc", "double"), ("qt_aquis_negoc", "double", "qt_aquis_negoc", "double"), ("vl_aquis_negoc", "double", "vl_aquis_negoc", "double"), ("qt_pos_final", "double", "qt_pos_final", "double"), ("vl_merc_pos_final", "double", "vl_merc_pos_final", "double"), ("vl_custo_pos_final", "string", "vl_custo_pos_final", "string"), ("dt_confid_aplic", "string", "dt_confid_aplic", "string"), ("tp_titpub", "string", "tp_titpub", "string"), ("cd_isin", "string", "cd_isin", "string"), ("cd_selic", "long", "cd_selic", "long"), ("dt_emissao", "string", "dt_emissao", "string"), ("dt_venc", "string", "dt_venc", "string")], transformation_ctx = "applymapping1")
## @type: ResolveChoice
## @args: [choice = "make_struct", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame = applymapping1, choice = "make_struct", transformation_ctx = "resolvechoice2")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://my-bucket/blc_1"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
datasink4 = glueContext.write_dynamic_frame.from_options(frame = dropnullfields3, connection_type = "s3", connection_options = {"path": "s3://my-bucket/blc_1"}, format = "parquet", transformation_ctx = "datasink4")
job.commit()
I get the following error:
19/03/27 19:10:07 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, ip-172-32-89-229.us-east-2.compute.internal, executor 1): com.amazonaws.services.glue.util.FatalException: Unable to parse file: cda_fi_BLC_1_201801.csv
at com.amazonaws.services.glue.readers.JacksonReader.hasNextFailSafe(JacksonReader.scala:94)
at com.amazonaws.services.glue.readers.JacksonReader.hasNext(JacksonReader.scala:38)
at com.amazonaws.services.glue.hadoop.TapeHadoopRecordReader.nextKeyValue(TapeHadoopRecordReader.scala:63)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.
Unfortunately, this gives no clue about what might be going on. I was able to run the ETL when the CSV content was not compressed, so my best guess is that I am doing something wrong with the gzip compression, or that some configuration is missing.
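One thing I considered checking is whether the GzipFile is fully finalized before I read the buffer. A variant of the upload step that closes the GzipFile before calling getvalue() (just a sketch of what I have in mind; it reuses the same variables as in the Lambda above) would look like this:

# Sketch: close the GzipFile before reading the buffer, so the gzip
# trailer is written before the object is uploaded to S3.
# (Reuses bytes, folder, fn, s3 and bucket_name from the Lambda above.)
gz_body = BytesIO()
with GzipFile(None, 'wb', 9, gz_body) as gz:
    gz.write(bytes.encode('utf8'))

s3.Bucket(bucket_name).put_object(Key=folder + "/" + fn,
                                  ContentType="text/plain",
                                  ContentEncoding='gzip',
                                  Body=gz_body.getvalue())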
If you know what is happening, I would appreciate some help.
Thanks!
This TapeHadoopRecordReader is very sensitive. I suggest you do this instead:
df = spark.read.csv('s3://path')
Then you can convert df to a dynamic frame using fromDF().
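For illustration, a minimal sketch of that approach dropped into the generated job script (the S3 path, separator, header and encoding options here are my assumptions based on the crawler output in the question, not something this answer prescribes):

from awsglue.dynamicframe import DynamicFrame

# Read the semicolon-delimited CSVs with Spark's native CSV reader,
# using the spark session and glueContext already created by the
# generated script.
df = spark.read.csv("s3://my-bucket/BLC_1/",
                    sep=";",
                    header=True,
                    encoding="UTF-8")

# Convert the Spark DataFrame back into a Glue DynamicFrame so the rest
# of the generated script (ApplyMapping, the Parquet sink) keeps working.
datasource0 = DynamicFrame.fromDF(df, glueContext, "datasource0")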