簡體   English   中英

我想使用 python 以正確的格式將數據刮到 csv 文件中

[英]I want to scrape data into csv file with proper format using python

我為我的任務編寫了幾乎完整的代碼,但數據存儲存在一個問題。 當我只運行單頁時,我的數據很好,但是當我嘗試運行 20 頁並將數據存儲為 csv 格式時,我遇到了格式錯誤,請查看我的代碼並指導我如何修復它。 謝謝

這是我的代碼:

import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import pandas as pd
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
    return soup

def get_detail_page(soup):

     try:
        title = (soup.find('h1',class_="cdm_style",id=False).text)
     except:
         title = 'Empty Title'
     try:
         collection = (soup.find('td',id="metadata_collec").find('a').text)
     except:
         collection = "Empty Collection"
     try:
         author = (soup.find('td',id="metadata_creato").text)
     except:
         author = "Empty Author"
     try:
         abstract = (soup.find('td',id="metadata_descri").text)
     except:
         abstract = "Empty Abstract"
     try:
         keywords = (soup.find('td',id="metadata_keywor").text)
     except:
         keywords = "Empty Keywords"
     try:
         publishers = (soup.find('td',id="metadata_publis").text)
     except:
         publishers = "Empty Publishers"
     try:
         date_original = (soup.find('td',id="metadata_contri").text)
     except:
         date_original = "Empty Date original"
     try:
        date_digital = (soup.find('td',id="metadata_date").text)
     except:
        date_digital = "Empty Date digital"
     try:
        formatt = (soup.find('td',id="metadata_source").text)
     except:
        formatt = "Empty Format"
     try:
        release_statement = (soup.find('td',id="metadata_rights").text)
     except:
        release_statement = "Empty Realease Statement"
     try:
        library = (soup.find('td',id="metadata_librar").text)
     except:
        library = "Empty Library"
     try:
        date_created = (soup.find('td',id="metadata_dmcreated").text)
     except:
        date_created = "Empty date Created"
     data = {
         'Title'        : title,
         'Collection'   : collection,
         'Author'       : author,
         'Abstract'     : abstract,
         'Keywords'     : keywords,
         'Publishers'   : publishers,
         'Date_original': date_original,
         'Date_digital' : date_digital,
         'Format'       : formatt,
         'Release-st'   : release_statement,
         'Library'      : library,
         'Date_created' : date_created


     }
     return data
def get_index_data(soup):
    try:
        titles_link = soup.find_all('a',class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None) #All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com",link.attrs.get('href', None)))
            except:
                continue
    return titles_link_output
def write_csv(data,url):
    with open('123.csv','a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['Title'], data['Collection'], data['Author'],
        data['Abstract'], data['Keywords'], data['Publishers'], data['Date_original'],
        data['Date_digital'], data['Format'], data['Release-st'], data['Library'],
        data['Date_created'], url]
        writer.writerow(row)
def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    #get_page(url)
    products = get_index_data(get_page(mainurl))
    for product in products:
        data = get_detail_page(get_page(product))
        write_csv(data,product)
    #write_csv(data,url)


if __name__ == '__main__':
    main()

如評論中所示,您從網站檢索到的文本似乎包含空格。 您可以使用strip方法刪除這些空格。 這可以在您構建data字典時完成,

data = {                                                                   
    'Title': title.strip(),                                        
    'Collection': collection.strip(),                                   
    'Author': author.strip(),                                       
    'Abstract': abstract.strip(),                                     
    'Keywords': keywords.strip(),                                     
    'Publishers': publishers.strip(),                                   
    'Date_original': date_original.strip(),                                
    'Date_digital': date_digital.strip(),                                 
    'Format': formatt.strip(),                                      
    'Release-st': release_statement.strip(),                            
    'Library': library.strip(),                                      
    'Date_created': date_created.strip()                                  
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM