I am trying to list all the files, including those in subdirectories, in a container of an Azure storage account that belongs to a different subscription. The business requirement is to use the abfss URL: abfss://<container>@<storage-account>.dfs.core.windows.net/<directory>/. I tried importing the Spark config for that subscription and used the code below to return the file list, but it failed.
import os
from fnmatch import fnmatch
# Root to scan and the case-insensitive glob that file names must match.
root_list = "abfss://staging@bdoibgedpsadlssandbox.dfs.core.windows.net/staging/"
files_list = []
pattern = "*.*"
print(pattern)


def _report_walk_error(error):
    """Print a directory that os.walk failed to list instead of skipping it silently."""
    print("os.walk could not list %r: %s" % (error.filename, error))


# NOTE: os.walk goes through the operating system's own file API, so it can
# only see local or mounted paths. By default it ignores listing errors, which
# is why an unreachable root (such as an abfss:// URL that is not mounted)
# produced an empty list without any message. The onerror hook makes that
# failure visible while still leaving files_list empty rather than crashing.
pattern_upper = pattern.upper()  # loop-invariant, so computed once
for path, subdirs, files in os.walk(root_list, onerror=_report_walk_error):
    for name in files:
        if fnmatch(name.upper(), pattern_upper):
            files_list.append(path + "/" + name)
This prints "[]", an empty list.
You can use the code below for this use case.
from pyspark.sql.functions import col
from azure.storage.blob import BlockBlobService
from datetime import datetime
import os.path
# Writes one pipe-delimited line per non-empty blob found under a name prefix
# of a container: account|container|blob name|size in bytes|last-modified date|
account_name = 'accountname'
container_name = 'container_name'
second_conatainer_name = 'data'
account_key = 'storage-account-key'
# Only blobs whose names start with "<second_conatainer_name>/" are listed.
prefix_val = second_conatainer_name + '/'
block_blob_service = BlockBlobService(account_name=account_name, account_key=account_key)
#block_blob_service.create_container(container_name)
generator = block_blob_service.list_blobs(container_name, prefix=prefix_val)
report_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
target_path = "/target2/%s.csv" % (container_name)
print(target_path)
# The with-statement guarantees the report is flushed and closed even if a
# service call fails part-way through the listing.
with open(target_path, 'w') as Target_file:
    for blob in generator:
        # Fetch the properties once per blob; every call is a separate
        # request to the storage service.
        properties = block_blob_service.get_blob_properties(container_name, blob.name).properties
        file_size = properties.content_length
        last_modified = properties.last_modified
        # Zero-length blobs are left out of the report.
        if file_size != 0:
            # str(last_modified)[:10] keeps only the leading YYYY-MM-DD part.
            line = account_name+'|'+container_name+'|'+blob.name+'|'+ str(file_size) +'|'+str(last_modified)[:10]+'|'
            print(line)
            Target_file.write(line+'\n')
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.