[英]How do I put information not in a table into a dataframe from web scraping?
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
town_names = [f"Abbeville-Alabama",
f"Abernant-Alabama",
f"Alpine-Utah",
f"Dixon-Montana",
f"Adak-Alaska",]
for town_name in town_names:
page = requests.get(f"https://www.city-data.com/city/{town_name}.html").text
doc = BeautifulSoup(page, "html.parser")
print(town_name)
sex_population = str(doc.find(id="population-by-sex"))
(males, females) = [float(x) for x in re.findall(r"(?<=\()[0-9]+\.[0-9]+(?=\%\))", sex_population)]
print(males, females)
# poverty_level = str(doc.find(id="poverty-level"))
# broke = float(re.findall("(<?<\/b> )[0-9]*.[0-9]*", poverty_level))
# print(broke)
# religion_population = str(doc.find(id="religion"))
# atheist = float(re.findall("(?<=None<\/td><td>)[0-9,]*(?=<\/td><td>)", religion_population)[0].replace(",", ""))
# print(atheist)
total_population = str(doc.find(id="city-population"))
residents = float(re.findall("(?<=</b> )[0-9]*", total_population)[0].replace(",", ""))
print(residents)
religion_population = doc.find(id="religion").find_all('tr')
data = []
for row in religion_population:
columns = row.find_all('td')
if columns:
religion = columns[0].get_text(strip=True)
number = columns[1].get_text(strip=True).replace(",", "").replace("-","0")
print(f'religion: {religion} | number: {number}')
data.append([religion, int(number)])
df = pd.DataFrame(data, columns=['religion', 'number'])
df['percentage'] = (df['number'] / df['number'].sum()) * 100
atheist=df[df.religion == "None"].iloc[0]["percentage"]
evangelicals = df[df.religion == "Evangelical Protestant"].iloc[0]["percentage"]
print(atheist)
print(evangelicals)
education_population = doc.find(id="education-info").find_all('b')
data = []
for row in education_population:
columns = row.find_all('b')
if columns:
education = columns[0].get_text(strip=True)
ed_number = columns[1].get_text(strip=True).replace(",", "").replace("-", "0")
print(f'education: {education} | number: {ed_number}')
data.append([education, int(ed_number)])
df = pd.DataFrame(data, columns=['education', 'number'])
df['percentage'] = (df['number'] / df['number'].sum()) * 100
phds = df[df.education == "Graduate or professional degree"].iloc[0]["percentage"]
highschoolgrads = df[df.education == "High school or higher"].iloc[0]["percentage"]
print(phds)
print(highschoolgrads)
print("\n")
如何將教育信息放入數據框中? 我試圖將價值觀組織成教育水平和百分比。
也知道為什么當我試圖將貧困水平作為一個浮動的破產時,它說它不能因為它是一個列表?
在這一點上,我只是在打字,這樣 stackoverflow 就會允許我發帖,因為它認為我沒有足夠的細節。 所以...如果有人對此感興趣,我正在做一個數據挖掘/機器學習項目,它將從 HRC 的市政平等指數和有關城市得分的信息中獲取分數,並嘗試學習如何估計未得分的城市得分。 這是我的第一個機器學習項目。 我擔心我第一次選擇了太大的東西,但我真的已經致力於它。
刮刀有問題。
在教育中,在 URL 中,值似乎位於列表元素li中,而標題/標簽位於li內部的b中。 在刮板中,您只是收集b標簽,因此您缺少所需的數值。 您可以更改它以使其正常工作。 謝謝!
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.