[英]Python scraper not writing correctly to csv
我是 Python 和编程的新手,但我的项目有点问题。 我正在尝试抓取网站的数据并将其保存在 csv 中。 我工作,但是当我将“lst”列表写入“Image URL”和“Image Featured”时,括号“[”和“]”和“””也被写入csv文件。有没有办法删除这个?我知道这是因为“lst”列表包含其他带有 url 的列表。
import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import date
today = date.today()
source = requests.get('https://www.meklarin.fo/').text
soup = BeautifulSoup(source, 'lxml')
df = pd.read_csv(r'C:\Users\username\Desktop\Kassin.fo\kassin\blog\management\commands\test.csv')
print(df.to_string())
original_house_title_list = []
original_house_link_list = []
house_titles_list = []
house_asking_price_list = []
house_current_bid_price_list = []
house_link_list = []
product = 'product'
current_date = today.strftime("%m.%d.%y")
house_image_list = []
house_location_list = []
lst = []
lst1 = []
house_info_list = []
house_final_info = []
list_convert = []
for house_link in soup.find_all('a', class_='house-air-content'):
house_link = house_link.get('href')
house_link_list.append(house_link.strip())
print(house_link.strip())
for house_link in house_link_list:
if house_link in original_house_link_list:
continue
else:
source = requests.get(house_link).text
soup = BeautifulSoup(source, 'lxml')
for house_titles in soup.find_all('div', class_='ogn-base-info'):
house_title = house_titles.h1.text
house_titles_list.append(house_title)
#print(house_title)
for house__asking_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-ask-price house-price-column'):
house_asking_price = house__asking_price.text
house_asking_price = str(house_asking_price)
house_asking_price = house_asking_price.removeprefix('Prísuppskotkr.')
house_asking_price = house_asking_price.replace('.','')
house_asking_price_list.append(house_asking_price.strip())
#print(house_asking_price.strip())
for house__current_bid_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-bid-price house-price-column'):
house_current_bid_price = house__current_bid_price.h3.text
house_current_bid_price = str(house_current_bid_price)
house_current_bid_price = house_current_bid_price.replace('.','')
house_current_bid_price = house_current_bid_price.replace('kr','')
house_current_bid_price_list.append(house_current_bid_price.strip())
print(house_current_bid_price.strip())
for house_all_images in soup.find_all('a'):
if 'https://www.meklarin.fo/wp-content/uploads' in str(house_all_images):
house_all_images = house_all_images.get('href')
house_image_list.append(house_all_images)
#print(house_all_images)
else:
continue
lst.append(house_image_list)
lst1.append(lst)
house_image_list=[]
for house_build_year in soup.find_all('div', class_='house-info-box-value'):
if 'Trýst her' in str(house_build_year):
continue
else:
print(house_build_year.text)
for house_info in soup.find_all('div', class_='house-desc-comp'):
house_info = house_info.text
house_info = str(house_info)
house_info = house_info.replace('Upplýsingar um bústaðin','')
house_info_list.append(house_info)
#print(house_info)
house_final_info.append(house_info)
house_info_list = []
dict = {'Title': house_titles_list, 'Content': house_final_info, 'Date':current_date, 'Post Type': product, 'Price': house_asking_price_list, 'Regular Price': house_asking_price_list, 'Sale Price':house_asking_price_list, 'Stock Status': 'instock', 'Image URL': lst, 'Image Title': house_titles_list, 'Image Featured': lst}
df = pd.DataFrame(dict)
df.to_csv('test.csv')
print(len(house_titles_list))
print(len(house_asking_price_list))
print(len(lst))
print(len(house_final_info))
要删除(示例)图像 URL 的单元格中的列表,请在写入文件之前尝试:
df['Image URL'] = [','.join(map(str, i)) for i in df['Image URL']]
可以复制上面的行并将上面的Image URL
更改为Image Featured
以清理另一列中的列表。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.