![](/img/trans.png)
[英]Copy only the latest file from azure data lake store with Azure Data Factory (ADF)
[英]copying 14gb file from ftp to azure data lake store using ADF
我正在尝试使用 Azure 数据工厂将 14gb 文件从 FTP 复制到我的 azure 数据湖存储。 当我执行管道时,它开始复制文件并在半小时内复制了近 13.9 GB。
即使在管道运行 8 小时后，剩余的数据也没有被复制，最终管道失败并提示文件不可用。 文件不可用的原因是源团队为放置下一个文件而删除了该文件。
将积分单位增加到 250
{
"name": "job_fa",
"properties": {
"activities": [
{
"name": "set_parameters_adh_or_sch",
"description": "validate and set the parameter values based on the runtype sch or adh",
"type": "Lookup",
"dependsOn": [
{
"activity": "br_bs_loggin",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [
{
"name": "CheckLookup1",
"value": "1"
}
],
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderStoredProcedureName": "[dbo].[usp_FeedParameters_main]",
"storedProcedureParameters": {
"FeedName_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_FeedName",
"type": "Expression"
}
},
"RunType_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_RunType",
"type": "Expression"
}
},
"SrcEnddate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcEndDate",
"type": "Expression"
}
},
"SrcStartdate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcStartDate",
"type": "Expression"
}
},
"TgtDate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_TargetDate",
"type": "Expression"
}
},
"SrcHour_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcHour",
"type": "Expression"
}
},
"TgtHour_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_TgtHour",
"type": "Expression"
}
}
}
},
"dataset": {
"referenceName": "AzureSql_cdpconfiguser",
"type": "DatasetReference"
},
"firstRowOnly": true
}
},
{
"name": "br_bs_loggin",
"description": "insert into the batch run and update the batch scheduler to started in case of sch run",
"type": "Lookup",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderStoredProcedureName": "[dbo].[usp_BatchRun]",
"storedProcedureParameters": {
"FeedName_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_FeedName",
"type": "Expression"
}
},
"RunType_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_RunType",
"type": "Expression"
}
},
"Status_in": {
"type": "String",
"value": "Started"
}
}
},
"dataset": {
"referenceName": "AzureSql_cdpconfiguser",
"type": "DatasetReference"
},
"firstRowOnly": true
}
},
{
"name": "Check if file exists in target",
"type": "GetMetadata",
"dependsOn": [
{
"activity": "Copy Data WT to ADLS",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"dataset": {
"referenceName": "AzureDataLakeStoreFile_wt_tgt_path_and_name",
"type": "DatasetReference",
"parameters": {
"TgtFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out",
"TgtFileName": {
"value": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out",
"type": "Expression"
}
}
},
"fieldList": [
"exists",
"size"
]
}
},
{
"name": "Copy Data WT to ADLS",
"type": "Copy",
"dependsOn": [
{
"activity": "set_parameters_adh_or_sch",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [
{
"name": "Source",
"value": "@{activity('set_parameters_adh_or_sch').output.firstrow.SrcFilePath_wo_dt_out}/@{activity('set_parameters_adh_or_sch').output.firstrow.SrcFileName_wt_dt_out}"
},
{
"name": "Destination",
"value": "@{activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out}/@{activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out}"
}
],
"typeProperties": {
"source": {
"type": "FileSystemSource",
"recursive": true
},
"sink": {
"type": "AzureDataLakeStoreSink"
},
"enableStaging": false,
"dataIntegrationUnits": 0
},
"inputs": [
{
"referenceName": "FTP_SRC_FA",
"type": "DatasetReference",
"parameters": {
"SrcFileName": "@activity('set_parameters_adh_or_sch').output.firstrow.SrcFileName_wt_dt_out",
"SrcFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.SrcFilePath_wo_dt_out"
}
}
],
"outputs": [
{
"referenceName": "AzureDataLakeStoreFile_wt_tgt_path_and_name",
"type": "DatasetReference",
"parameters": {
"TgtFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out",
"TgtFileName": {
"value": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out",
"type": "Expression"
}
}
}
]
},
{
"name": "br_bs_update_failed",
"type": "SqlServerStoredProcedure",
"dependsOn": [
{
"activity": "Copy Data WT to ADLS",
"dependencyConditions": [
"Failed"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Failed",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
},
{
"name": "If Condition1",
"type": "IfCondition",
"dependsOn": [
{
"activity": "Check if file exists in target",
"dependencyConditions": [
"Succeeded"
]
}
],
"typeProperties": {
"expression": {
"value": "@equals(activity('Check if file exists in target').output.exists,true)",
"type": "Expression"
},
"ifFalseActivities": [
{
"name": "Stored Procedure_failed",
"type": "SqlServerStoredProcedure",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Failed",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
}
],
"ifTrueActivities": [
{
"name": "Stored Procedure1",
"type": "SqlServerStoredProcedure",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Succeeded",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
}
]
}
}
],
"parameters": {
"p_FeedName": {
"type": "String",
"defaultValue": "fa_cpsmyid_vdumcap1"
},
"p_BatchType": {
"type": "String",
"defaultValue": "RAW"
},
"p_RunType": {
"type": "String",
"defaultValue": "sch"
},
"p_SrcStartDate": {
"type": "String"
},
"p_SrcEndDate": {
"type": "String"
},
"p_TargetDate": {
"type": "String"
},
"p_SrcHour": {
"type": "String"
},
"p_TgtHour": {
"type": "String"
}
},
"variables": {
"v_StartDate": {
"type": "String"
},
"v_EndDate": {
"type": "String"
}
},
"folder": {
"name": "Batch_load"
}
},
"type": "Microsoft.DataFactory/factories/pipelines"
}
根据您的描述,我认为所有关注点都是提高传输性能。
首先，参考数据集成单元（DIU）的官方说明，DIU
仅适用于 Azure 集成运行时，而不适用于自托管集成运行时。您的源数据来自 FTP，所以即使您已将 DIU 设置为最大值，我认为复制性能也不受 DIU
数值的影响。（当然，这是官方文档中提到的，你仍然可以向 ADF 团队求证）
那么也许你可以从这篇文档中找到一些提高复制性能的线索。
如: 1. 尝试使用parallelCopies
属性来指示您希望 Copy Activity 使用的并行度。 但根据官方文档的说明，该属性也有一些使用限制。
2.尝试将接收器数据集设置为Azure SQL Data Warehouse
,因为它似乎比 ADL 具有更好的性能。
3.尝试从源数据集中压缩文件以减小文件大小。
4.考虑使用Azure Cloud Service作为源数据集,例如Azure Blob Storage,据我所知,Azure服务之间的复制活动的性能通常更好。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.