I'm new to python and currently writing an application that scrapes data off the web. It's mostly done, there is only a little problem left with encoding. The site is encoded in ISO-8859-1
, but when I try to html.decode('iso-8859-1')
, it doesn't do anything. If you run the program, use 50000
and 50126
for PLZs and you'll see what I mean in the output. It would be awesome if someone could help me out.
import urllib.request
import time
import csv
import operator
from bs4 import BeautifulSoup
#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
requestResult = urllib.request.urlopen(request)
soup = BeautifulSoup(requestResult)
return soup
#Returns all the result links from the given search parameters
def getLinksFromSearch(plz_von, plz_bis):
database = []
links = []
#The search parameters
params = {
'name_ff': '',
'strasse_ff': '',
'plz_ff': plz_von,
'plz_ff2': plz_bis,
'ort_ff': '',
'bundesland_ff': '',
'land_ff': 'DE',
'traeger_ff': '',
'Dachverband_ff': '',
'submit2' : 'Suchen'
}
DATA = urllib.parse.urlencode(params)
DATA = DATA.encode('utf-8')
request = urllib.request.Request(
"http://www.altenheim-adressen.de/schnellsuche/suche1.cfm",
DATA)
# adding charset parameter to the Content-Type header.
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
#The search request
html = doRequest(request)
h = html.decode('iso-8859-1')
soup = BeautifulSoup(h)
for link in soup.find_all('a'):
database.append(link.get('href'))
#Remove the first Element ('None') to avoid Attribute Errors
database.pop(0)
for item in database:
if item.startswith("suche"):
links.append(item)
return links
#Performs a search on the link results
def searchOnLinks(links):
adresses = []
i = 1
j = len(links)
print("Found", j, "results, collecting data.")
for item in links:
adresses.append(getContactInfoFromPage(item, i, j))
i = i + 1
time.sleep(0.1)
print("All done.")
return adresses
#A method to scrape the contact info from the search result
def getContactInfoFromPage(page, i, j):
name = ''
straße = ''
plz = ''
stadt = ''
telefon = ''
mail = ''
url = ''
data = [
#'Name',
#'Straße',
#'PLZ',
#'Stadt',
#'Telefon',
#'E-Mail',
#'Homepage'
]
request = urllib.request.Request("http://www.altenheim-adressen.de/schnellsuche/" + page)
#request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("Content-Type", "text/html;charset=UTF-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
print("(" , i , "/" , j , ") Making request...")
soup = doRequest(request)
print("Done.")
findeName = soup.findAll('b')
name = findeName[2]
name = name.string.split('>')
data.append(name[0])
straße = getFieldValue(soup, "Straße")
data.append(straße)
ort = getFieldValue(soup, "Ort")
(plz, stadt) = ort.split(' ', 1)
data.append(plz)
data.append(stadt)
telefon = getFieldValue(soup, "Telefon")
data.append(telefon)
mail = getFieldValue(soup, "EMail")
data.append(mail)
url = getFieldValue(soup, "Internetadresse")
data.append(url)
return data
#Strips the text from the given field's sibling
def getFieldValue(soup, field):
field_label = soup.find('td', text=field + ':')
return field_label.find_next_sibling('td').get_text(strip=True)
#The main input/output function
def inputOutput():
#PLZ is German for zip-code and consists of a five-digit number
#The program passes the numbers to the servers, and the server
#returns all search results between the two numbers
plz_von = input("Please enter first PLZ: ")
plz_bis = input("Please enter second PLZ: ")
links = getLinksFromSearch(plz_von, plz_bis)
#Checks if the search yielded any results
if len(links) > 0:
data = searchOnLinks(links)
file_name = input("Save as: ")
print("Writing to file...")
with open(file_name + '.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
a.writerows(data)
else:
print("The search yielded no results.")
inputOutput()
Your doRequest()
function returns a BeautifulSoup object , you cannot decode that object. Just use it directly:
soup = doRequest(request)
You don't need to decode the response at all; BeautifulSoup uses both hints in the HTML ( <meta>
headers) as well as statistical analysis to determine the correct input encoding.
In this case the HTML document claims it is Latin-1:
<meta name="content-type" content="text/html; charset=iso-8859-1">
The response doesn't include a character set in the Content-Type header either, so this is a case of a misconfigured server. You can force BeautifulSoup to ignore the <meta>
header with:
soup = BeautifulSoup(requestResult, from_encoding='utf8')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.