簡體   English   中英

有沒有辦法在 Azure 數據工廠中創建 blob 並將數據上傳到 blob 容器?

[英]Is there a way to create a blob and upload the data to the blob container in Azure data factory?

我目前正在使用 Azure 數據工廠和 Python 構建數據管道。Python 腳本執行簡單的網頁抓取並將文件保存到本地,然後將該文件上傳到 Azure 存儲上相應的 blob 容器,並寫入 Azure SQL 數據庫。

但是,我希望的解決方案是跳過任何本地存儲——換句話說,在 Python 腳本抓取到數據之後,直接將數據寫入 blob 容器中的新 blob 文件,而無需使用任何本地存儲。

有誰知道任何好的教程或資源?這在 Azure 數據工廠中是否可行? 謝謝!

from typing import Container
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from azure.storage.blob import BlobClient
import pandas as pd
import os
import yaml
from yaml.loader import Loader
from azure.storage.blob import ContainerClient 

class RightmoveScraper:
    """Scrape Rightmove search results, store them as CSV and upload to Azure.

    The pipeline is: parse a saved search-results page (``res.html``),
    write the extracted listings to ``rightmove.csv`` and upload every file
    found in the configured source folder to an Azure blob container.
    """

    def __init__(self):
        # Per-instance list: a class-level list would be shared (and grow)
        # across every scraper instance.
        self.results = []

    def fetch(self, url, timeout=30):
        """Issue an HTTP GET for *url* and return the ``requests`` response.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait before giving up, so a stalled server
                cannot hang the pipeline forever.
        """
        print('HTTP GET reuqest to URL')
        response = requests.get(url, timeout=timeout)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html):
        """Extract listings from *html* and append them to ``self.results``.

        Each appended item is a dict with the keys ``title``, ``address``,
        ``description``, ``prices``, ``dates``, ``agents``, ``isSale``,
        ``floorplan?``, ``photos`` and ``virtualtours``.
        """
        content = BeautifulSoup(html, 'lxml')

        # Page-wide selectors: one list per field, matched up by position.
        cards = content.find_all('div', {'l-searchResult is-list'})
        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        addresses = [address.text.strip() for address in content.findAll('address', {'propertyCard-address'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.find_all('div', {'propertyCard-priceValue'})]
        dates = [date.text.split()[-1] for date in content.findAll('span', {'propertyCard-branchSummary-addedOrReduced'})]
        agents = [agent.text.split('by')[-1].strip() for agent in content.findAll('div', {'propertyCard-branchSummary'})]
        isSale = 'Sale'

        floorplans = []
        photos = []
        virtualtours = []

        # Look up each optional element on its own so a missing one cannot
        # leave the three lists with different lengths.
        for card in cards:
            floorplan = card.find('span', {'no-svg-floorplan propertyCard-moreInfoIcon'})
            photo_count = card.find('span', {'propertyCard-moreInfoNumber'})
            virtualtour = card.find('span', {'no-svg-virtualtour propertyCard-moreInfoIcon'})

            floorplans.append("No floorplan" if floorplan is None else "Has floorplan")
            photos.append("No photos" if photo_count is None else photo_count.text)
            virtualtours.append("No virtual tour" if virtualtour is None else "Has virtual tour")

        for index, title in enumerate(titles):
            self.results.append({
                'title': title,
                'address': addresses[index],
                'description': descriptions[index],
                'prices': prices[index],
                'dates': dates[index],
                'agents': agents[index],
                'isSale': isSale,
                'floorplan?': floorplans[index],
                'photos': photos[index],
                'virtualtours': virtualtours[index],
            })

    @staticmethod
    def no_blank(fd):
        """Yield the lines of the iterable *fd* that are not blank."""
        for line in fd:
            if line.strip():
                yield line

    def to_csv(self):
        """Write ``self.results`` to ``rightmove.csv`` in the working directory.

        Rows whose values are all empty or whitespace are skipped. Nothing is
        written when there are no results.
        """
        if not self.results:
            print('No results to store')
            return

        # utf-8 so non-ASCII characters in scraped text cannot fail on
        # platforms whose default encoding is narrower.
        with open('rightmove.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(self.results[0]))
            writer.writeheader()

            for row in self.results:
                # Check the values (not the keys) and actually call strip().
                if any(value is not None and str(value).strip() for value in row.values()):
                    writer.writerow(row)

            print('Stored results to "rightmove.csv"')

    def run(self):
        """Run the pipeline: parse ``res.html``, write the CSV, upload."""
        # The live request is disabled; a previously saved copy of the page
        # is parsed instead. Read it in one call so no line is dropped.
        with open('res.html', 'r') as html_file:
            html = html_file.read()

        self.parse(html)
        self.to_csv()
        self.sendToAzure()

    # ------------------ uploading files to Azure ------------------

    def load_config(self):
        """Load ``config.yaml`` from the directory containing this file."""
        dir_root = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(dir_root, "config.yaml"), "r") as yamlfile:
            # NOTE(review): yaml.load can construct arbitrary Python objects
            # with some loaders; prefer yaml.safe_load if this config could
            # ever come from an untrusted source.
            return yaml.load(yamlfile, Loader=yaml.FullLoader)

    def get_files(self, dir):
        """Yield directory entries for the non-hidden regular files in *dir*."""
        with os.scandir(dir) as entries:
            for entry in entries:
                if entry.is_file() and not entry.name.startswith('.'):
                    yield entry

    def upload(self, files, connection_string, container_name, overwrite=False):
        """Upload each entry in *files* to the named Azure blob container.

        Args:
            files: Iterable of objects exposing ``name`` and ``path``.
            connection_string: Azure storage connection string.
            container_name: Target blob container.
            overwrite: Passed to ``upload_blob``; when False (the default)
                uploading a blob that already exists raises an error.
        """
        container_client = ContainerClient.from_connection_string(connection_string, container_name)
        print("Uploading files to blob storage...")

        for file in files:
            blob_client = container_client.get_blob_client(file.name)
            with open(file.path, "rb") as data:
                print(data)
                blob_client.upload_blob(data, overwrite=overwrite)
                print(f'{file.name} uploaded to blob storage')

    def sendToAzure(self):
        """Upload every file in the configured source folder to Azure."""
        config = self.load_config()
        datasets = self.get_files(config["source_folder"])
        self.upload(datasets, config["azure_storage_connectionstring"], config["data_containername"])
     
if __name__ == '__main__':
    # Script entry point: build a scraper and run the whole pipeline.
    RightmoveScraper().run()
    

正如您所提到的,您不想在上傳時先在本地創建文件,而是希望直接傳遞數據。

我使用了虛擬數據 “abc”,您可以將抓取到的數據傳遞給
blob_client.upload_blob(data, blob_type="BlockBlob") 這個函數。

嘗試使用此代碼

from azure.storage.blob import BlobServiceClient

# Connect to the storage account (replace the placeholder with a real
# connection string).
blob_service_client = BlobServiceClient.from_connection_string("Connection String")

# Client for the target container.
blob_container_client = blob_service_client.get_container_client("test")

# Name of the blob to create inside the container.
dest_file_name = "test.csv"
print("Creating the file ")

# In-memory payload; no local file is involved.
data = "abc"

# Derive the blob client from the container client, then upload the payload.
blob_client = blob_container_client.get_blob_client(dest_file_name)
blob_client.upload_blob(data, blob_type="BlockBlob")

OUTPUT

它創建 blob 並寫入數據

在此處輸入圖像描述

在此處輸入圖像描述

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM