
Folder Statistics in Azure Data Lake


I am trying to summarize how much data has been written to a folder in my Data Lake. What is the best way to do this? Should I use a U-SQL job? Insights?

There are two ways to do this:

  1. If this is a one-time operation, you can use Azure Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/), navigate to the Data Lake Store folder, and get its size.
  2. If you want to do this programmatically, Data Lake Store exposes a WebHDFS-compliant API that can list several folder properties: GETCONTENTSUMMARY. You can find more details here: https://learn.microsoft.com/en-us/rest/api/datalakestore/webhdfs-filesystem-apis (a minimal request sketch follows this list).
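If you want to call GETCONTENTSUMMARY yourself, a minimal Python sketch could look like the following. The account name, folder path, and Azure AD bearer token are placeholders you would supply, and the response fields follow the standard WebHDFS contract:

import requests

# Placeholder values - replace with your own account, folder and token
account = "<your-adls-gen1-account>"
folder = "mydata/2019"             # folder path inside the store
token = "<azure-ad-bearer-token>"

# WebHDFS-compatible endpoint exposed by Data Lake Store (Gen1)
url = "https://{0}.azuredatalakestore.net/webhdfs/v1/{1}?op=GETCONTENTSUMMARY".format(account, folder)

resp = requests.get(url, headers={"Authorization": "Bearer " + token})
resp.raise_for_status()

summary = resp.json()["ContentSummary"]
print("Directories:", summary["directoryCount"])
print("Files:", summary["fileCount"])
print("Bytes consumed:", summary["spaceConsumed"])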

Hope this helps,

Jose

You can use Python code to loop over the files. See here: https://cloudarchitected.com/2019/05/computing-total-storage-size-of-a-folder-in-azure-data-lake-storage-gen2/
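For reference, a condensed sketch of that approach using the azure-storage-file-datalake SDK (ADLS Gen2) might look like this; the account URL, filesystem name, folder path, and credential are assumptions to adapt to your environment:

from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

# Assumed values - adjust for your own storage account and container
service = DataLakeServiceClient(
    account_url="https://<your-account>.dfs.core.windows.net",
    credential=DefaultAzureCredential())
fs = service.get_file_system_client("<your-filesystem>")

# Walk every path under the folder and sum the file sizes
total_bytes = 0
for p in fs.get_paths(path="raw/2019", recursive=True):
    if not p.is_directory:
        total_bytes += p.content_length

print("Total size: {:.2f} GB".format(total_bytes / 1024 ** 3))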

If you want a quick cross-check:

Download the Azure Storage Explorer Windows application from https://azure.microsoft.com/en-in/features/storage-explorer/

Open the folder whose size details you want to view.

In the top menu bar, select More -> Folder Statistics. This gives you the details of the directory, including its size in bytes. See the attached [sample screenshot of the Azure Storage Explorer menu][1].

[1]: https://i.stack.imgur.com/R1DuZ.jpg

Below is a script that helps collect folder/file statistics. Please also check all the variables and set values appropriate for your environment.

import csv, os, datetime,configparser
from azure.datalake.store import core,lib

# Returns the size of each subdirectory 
def getUsage(adls_client,data,level):
    
    temp=[]
    # Split the path by '/' and store in list
    for i in data:
        temp.append(i.split('/'))

    # Prepare PathList by removing the filenames 
    path=[]
    pathList=[]
    for i in temp:

        # Ensure Depth of the Path is not crossing level
        path=[]
        if len(i)-1 >= level:
            maxDepth = level
        else:
            maxDepth = len(i)-1
            
        for j in range(maxDepth):
        
            # Skip '_SUCCESS' marker files and avoid duplicate path segments
            if i[j] not in path and i[j] != '_SUCCESS':
                path.append(i[j])
        
        pathList.append(path)
    
    # Remove duplicates
    uniquePaths = set(tuple(x) for x in pathList)
    pathsPreparedDU= list("/".join(x) for x in uniquePaths)
    
    # Get usage for the directories from prepared paths
    answers=[]
    temp=[]
    temp2=""
    blankLevelCnt =0 

    for i in pathsPreparedDU:
        temp=[]
        temp2=""
        usage=adls_client.du(i, deep=True, total=True)
        temp.append(i.split('/'))
        for item in temp:
            if len(item) < level+1:
                blankLevelCnt = (level+1) - len(item)
        temp2=temp2+i
        for j in range(blankLevelCnt):
            temp2=temp2+"/"
        temp2=temp2+str(usage)
        answers.append([temp2])

    # add element for CSV header
    csvList = []
    temp=[]
    temp.append("Date/Time")    
    for i in range(level):
        temp.append("Level "+str(i+1))

    temp.append("Size (Bytes)")    
    temp.append("Size (KB)")    
    temp.append("Size (MB)")    
    temp.append("Size (GB)")    
    temp.append("Size (TB)")    

    csvList.append(temp)
    now = datetime.datetime.now()

    for i in answers:
        usageBytes = int(i[0].split('/')[-1])
        usageKBytes = round(usageBytes/1024,2)
        usageMBytes = round(usageKBytes/1024,2)
        usageGBytes = round(usageMBytes/1024,2)
        usageTBytes = round(usageGBytes/1024,2)
        csvList.append((str(now)[:16]+"/"+i[0]+"/"+str(usageKBytes)+"/"+str(usageMBytes)+"/"+str(usageGBytes)+"/"+str(usageTBytes)).split('/'))

    return csvList

# Returns the adls_client object
def connectADLS(tenant_id,app_id,app_key, adls_name):
    adls_credentials = lib.auth(tenant_id=tenant_id,client_secret=app_key,client_id=app_id)
    return core.AzureDLFileSystem(adls_credentials, store_name=adls_name)

# Returns all files under the root directory (walk lists them recursively)
def getSubdirectories(adls_client,root_dir):
    return adls_client.walk(root_dir)

# Write to CSV
def writeCSV(root_dir,csvList):
    
    fileprefixes = root_dir.split('/')
    prefix = "root-"
    while('' in fileprefixes) : 
        fileprefixes.remove('') 

    if len(fileprefixes) > 0:
        prefix = ""
        for i in fileprefixes:
            prefix = prefix + i + "-" 
    
    x = datetime.datetime.today().strftime('%Y-%m-%d')

    filename = prefix+""+ x +".csv"

    with open(filename, "w+") as csvFile:
        writer = csv.writer(csvFile,lineterminator='\n')
        writer.writerows(csvList)

    print("file Generated")
    print('##vso[task.setvariable variable=filename;]%s' % (filename))

if __name__ == "__main__":

    # 1. Parse config file and get service principal details
    config = configparser.ConfigParser()
    config.sections()
    config.read('config.ini')
    
    tenant_id=config['SERVICE_PRINCIPAL']['tenant_id']
    app_id=config['SERVICE_PRINCIPAL']['app_id']
    app_key=config['SERVICE_PRINCIPAL']['app_key']
    adls_name = config['ADLS_ACCT']['adls_name'] 
    root_dir = config['ADLS_ACCT']['root_dir'] 
    level = config['ADLS_ACCT']['level'] 

    # 2. Connect to ADLS 
    adls_client = connectADLS(tenant_id,app_id,app_key, adls_name)

    # 3. recursively lists all files
    data = getSubdirectories(adls_client,root_dir)

    # 4. Get usage for the directories
    csvList = getUsage(adls_client,data,int(level))

    # 5. Write the usage report to a CSV file
    writeCSV(root_dir,csvList)
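
The script reads its settings from a config.ini file in the working directory. Based on the keys referenced in the __main__ block, a template would look like the following (all values are placeholders):

[SERVICE_PRINCIPAL]
tenant_id = <your-tenant-guid>
app_id = <service-principal-application-id>
app_key = <service-principal-secret>

[ADLS_ACCT]
adls_name = <data-lake-store-name>
root_dir = /raw/2019
level = 3

Here level controls how many path segments deep the usage is broken down; it becomes the Level 1 .. Level N columns in the generated CSV.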
