![](/img/trans.png)
[英]Azure: create storage account with container and upload blob to it in Python
[英]Is there a way to create a blob and upload the data to the blob container in Azure data factory?
我目前正在 Azure 數據工廠和 python 上構建數據管道。 The python script performs a simple webscrape and saves the file locally, this file is then uploaded to the respective blob container on Azure storage and into an Azure SQL database.
但是,我希望做出的解決方案是跳過任何本地存儲 - 換句話說,直接在網絡抓取 python 腳本上的數據之后 - 我想寫入 blob 容器中的新 blob 文件,而無需調用任何本地貯存。
有誰知道任何好的教程/資源/這在 Azure 數據工廠是否可行? 謝謝!
from typing import Container
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from azure.storage.blob import BlobClient
import pandas as pd
import os
import yaml
from yaml.loader import Loader
from azure.storage.blob import ContainerClient
class RightmoveScraper:
    """Scrape a saved Rightmove search-results page, write the listings to a
    local CSV, and upload the data files to an Azure Blob Storage container.

    Configuration (connection string, container name, source folder) is read
    from ``config.yaml`` next to this script.
    """

    def __init__(self):
        # BUG FIX: `results` was a mutable *class* attribute, shared by every
        # instance; make it per-instance state instead.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the ``requests.Response``."""
        print('HTTP GET request to URL')  # typo fix: was 'reuqest'
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html):
        """Parse one Rightmove results page and append one dict per listing
        to ``self.results``.

        Selector classes are Rightmove's markup as of the original scrape
        (via ws.io) — they may need updating if the site changes.
        """
        content = BeautifulSoup(html, 'lxml')

        ## set selectors to scrape ##
        ## from ws.io
        cards = content.find_all('div', {'l-searchResult is-list'})
        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        addresses = [address.text.strip() for address in content.findAll('address', {'propertyCard-address'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.find_all('div', {'propertyCard-priceValue'})]
        dates = [date.text.split()[-1] for date in content.findAll('span', {'propertyCard-branchSummary-addedOrReduced'})]
        agents = [agent.text.split('by')[-1].strip() for agent in content.findAll('div', {'propertyCard-branchSummary'})]
        agentnumbers = [agentnumber.text for agentnumber in content.findAll('a', {'propertyCard-contactsPhoneNumber'})]  # scraped but not yet emitted
        isSale = 'Sale'
        totalresults = [totalresult.text for totalresult in content.findAll('span', {'searchHeader-resultCount'})]  # scraped but not yet emitted

        floorplans = []
        photos = []
        virtualtours = []
        # BUG FIX: the original wrapped all three appends in one try/except.
        # When `card.find(...)` returned None and `.text` raised, the handler
        # appended a second time to `floorplans` (misaligning the lists) and
        # called `virtualtours(None)` — a TypeError. Probe each tag first,
        # then append exactly once per card.
        for card in cards:
            floorplan_tag = card.find('span', {'no-svg-floorplan propertyCard-moreInfoIcon'})
            photo_tag = card.find('span', {'propertyCard-moreInfoNumber'})
            virtualtour_tag = card.find('span', {'no-svg-virtualtour propertyCard-moreInfoIcon'})
            floorplans.append(floorplan_tag)
            photos.append(photo_tag.text if photo_tag is not None else None)
            virtualtours.append(virtualtour_tag)

        # Normalise the probe results into display strings.
        for index in range(len(floorplans)):
            floorplans[index] = "Has floorplan" if floorplans[index] is not None else "No floorplan"
            if photos[index] is None:
                photos[index] = "No photos"  # otherwise keep the scraped photo count text
            virtualtours[index] = "Has virtual tour" if virtualtours[index] is not None else "No virtual tour"

        for index in range(len(titles)):
            self.results.append({
                'title': titles[index],
                'address': addresses[index],
                'description': descriptions[index],
                'prices': prices[index],
                'dates': dates[index],
                'agents': agents[index],
                'isSale': isSale,
                'floorplan?': floorplans[index],
                'photos': photos[index],
                'virtualtours': virtualtours[index]
            })

    @staticmethod
    def no_blank(fd):
        """Yield only the non-blank lines from the line iterator *fd*."""
        # BUG FIX: the original used next() inside a bare `except:` that
        # swallowed *every* exception, not just StopIteration.
        for line in fd:
            if line.strip():
                yield line

    def to_csv(self):
        """Write ``self.results`` to rightmove.csv, skipping all-blank rows."""
        with open('rightmove.csv', 'w', newline='') as csv_file:
            # keys() of the first result dict defines the column order
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                # BUG FIX: the original tested `field.strip` (a bound method,
                # always truthy) over the dict's *keys*; test the values and
                # actually call strip() so blank rows are really skipped.
                if any(str(value).strip() for value in row.values()):
                    writer.writerow(row)
        print('Stored results to "rightmove.csv"')

    def run(self):
        """End-to-end pipeline: read saved page -> parse -> CSV -> Azure."""
        # response = self.fetch('https://www.rightmove.co.uk/property-for-sale/Central-London.html')
        # BUG FIX: the original mixed `for line in html_file` with
        # `html_file.read()`, which silently dropped the first line of the
        # file. Just read the whole file once.
        with open('res.html', 'r') as html_file:
            html = html_file.read()
        # html_file.write(response.text) #writes request into html code
        self.parse(html)
        self.to_csv()
        self.sendToAzure()

    ######## uploading files to azure ###############
    def load_config(self):
        """Load config.yaml (sitting next to this script) and return it as a dict."""
        dir_root = os.path.dirname(os.path.abspath(__file__))
        with open(dir_root + "/config.yaml", "r") as yamlfile:
            return yaml.load(yamlfile, Loader=yaml.FullLoader)

    def get_files(self, dir):
        """Yield the regular, non-hidden files directly inside *dir*."""
        with os.scandir(dir) as entries:
            for entry in entries:
                if entry.is_file() and not entry.name.startswith('.'):
                    yield entry

    def upload(self, files, connection_string, container_name):
        """Upload each os.DirEntry in *files* as a blob named after the file."""
        container_client = ContainerClient.from_connection_string(connection_string, container_name)
        print("Uploading files to blob storage...")
        for file in files:
            blob_client = container_client.get_blob_client(file.name)
            with open(file.path, "rb") as data:
                blob_client.upload_blob(data)
                print(f'{file.name} uploaded to blob storage')

    def sendToAzure(self):
        """Read config and push every file from the source folder to the container."""
        config = self.load_config()
        datasets = self.get_files(config["source_folder"])
        self.upload(datasets, config["azure_storage_connectionstring"], config["data_containername"])
def _main():
    """Entry point: scrape the saved results page and push everything to Azure."""
    RightmoveScraper().run()


if __name__ == '__main__':
    _main()
正如您所提到的,您不希望在上傳時先在本地創建文件再傳遞數據。
我將虛擬數據作為“abc”,您可以將抓取的數據傳遞給
blob_client.upload_blob(data, blob_type="BlockBlob")
這個函數。
嘗試使用此代碼
# Demo: write scraped data straight to a blob, with no local file in between.
from azure.storage.blob import BlobServiceClient

service_client = BlobServiceClient.from_connection_string("Connection String")

# Initialise a handle on the target container
container_handle = service_client.get_container_client("test")

# Target blob name inside the container
dest_file_name = 'test.csv'
print("Creating the file ")

# Dummy payload — replace with the scraped data string
payload = "abc"
target_blob = service_client.get_blob_client("test", dest_file_name)
target_blob.upload_blob(payload, blob_type="BlockBlob")
OUTPUT
它創建 blob 並寫入數據
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.