[英]Parse data using BeautifulSoup in python
我需要從一個網站解析數據:http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html
BeautifulSoup 的大部分教程都是用於解析鏈接,而不是從鏈接中深入解析所需數據。
現在我瀏覽了 python 的 BeautifulSoup 模塊的一些教程,並編寫了這個腳本來下載所需的數據字符串
<div id="content_box">
<div id="content" class="hfeed">...
我正在使用的腳本:
from BeautifulSoup import BeautifulSoup
import urllib2
def main():
    """Download the job posting page and write each detail field to postdata.txt.

    The values on the page live in <span class="scdetail"> elements inside
    <div id="content_box">; searching for <div class="scdetail"> (as the
    original script did) matches nothing and produces an empty file.
    """
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup(data)
    postdata = bs.find('div', {'id': 'content_box'})
    # Bug fix: the detail values are <span> elements, not <div> elements.
    postdata = [s.getText().strip()
                for s in postdata.findAll('span', {'class': 'scdetail'})]
    fname = 'postdata.txt'
    with open(fname, 'w') as outf:
        outf.write('\n'.join(postdata))

if __name__ == "__main__":
    main()
但是這個腳本沒有按我的預期執行。我想把職位數據逐項寫入文件,例如:
職位:國家電子和信息技術研究所副中心經理職位空缺 - 昌迪加爾
分中心經理
國立電子信息技術研究所
地址:NIELIT,昌迪加爾 SCO:114-116 Sector 17B
郵政編碼:160017
昌迪加爾市等......
請幫助或建議。
謝謝
你的問題出在這裡:postdata.findAll('div', {'class': 'scdetail'})。你在尋找 div 標籤,而頁面上實際用的是 span 標籤。把它改成 postdata.findAll('span', {'class': 'scdetail'}) 就能得到非空的結果。
您要讀取的值之一的示例:
<div class="scheading">
"Pay Scale: " <span class="scdetail" itemProp="baseSalary">Rs. 15,000/-</span>
</div>
這個 pyparsing 提取器將挑選出匹配的 div/span 標簽:
from pyparsing import makeHTMLTags, withAttribute, SkipTo
"""
sample:
<div class="scheading">Postal Code: <span class="scdetail"
itemprop="postalCode">160017</span></div>
"""
# Build expressions matching opening/closing <div> and <span> tags.
div, divEnd = makeHTMLTags("div")
span, spanEnd = makeHTMLTags("span")
# Restrict matches to tags carrying the expected class attribute.
div.setParseAction(withAttribute(("class", "scheading")))
span.setParseAction(withAttribute(("class", "scdetail")))
# Pattern: <div class="scheading"> LABEL <span class="scdetail"> VALUE </span> </div>
# "label" captures the text before the span, "value" the span's content.
patt = (div + SkipTo(span)("label") + span + SkipTo(spanEnd)("value") +
        spanEnd + divEnd)
attrs = {}
# NOTE(review): 'html' (the fetched page source) must be defined before this
# point — it is not shown in this snippet.
for match in patt.searchString(html):
    # Key each (label, value) pair by the span's itemprop attribute.
    attrs[match.itemprop] = (match.label[0].strip(), match.value)
from pprint import pprint
pprint(attrs.items())
輸出:
[('skills',
('Desired Skills:',
'Preference will be given to candidates having good knowledge of UNIX & Visual FoxPro.')),
('qualifications',
('Qualifications:',
'\x91A\x92 level of DOEACC / PGDCA with 2 years experience. ')),
('educationRequirements',
('Educational Requirements:',
'B. E. / B. Tech. (CS / IT / Electronics) / MCA / M. Sc. (CS / IT / Electronics) / \x91B\x92 level of DOEACC ')),
('addressLocality', ('City', 'Chandigarh')),
('addressRegion', ('State', 'Haryana and Punjab')),
('streetAddress', ('Address:', 'NIELIT, Chandigarh SCO: 114-116 Sector 17B')),
('postalCode', ('Postal Code:', '160017')),
('baseSalary', ('Pay Scale:', 'Rs. 15,000/-'))]
此解決方案使用 BeautifulSoup
import os
import sys
# Import System libraries
import re
import urllib2
# Import Custom libraries
from BeautifulSoup import BeautifulSoup, Tag
# PEP 8: use def statements instead of assigning lambdas to names.
# Each predicate below is a BeautifulSoup findAll() filter; it receives a tag
# object exposing .name and .attrs (a list of (attribute, value) pairs).
def job_location(x):
    # Matches the <div id="content"> element wrapping the whole posting.
    return x.name == "div" and set([(u"id", u"content")]) <= set(x.attrs)

def job_title_location(x):
    # Matches the element carrying the schema.org job title.
    return set([(u"class", u"schema_title"), (u"itemprop", u"title")]) <= set(x.attrs)

def organ_location(x):
    # Matches the element naming the hiring organisation.
    return set([(u"class", u"schema_hiringorganization"), (u"itemprop", u"name")]) <= set(x.attrs)

def details_key_location(x):
    # Matches <div> elements whose class looks like "s...heading" (e.g. "scheading").
    return x.name == "div" and bool(re.search("s.*heading", dict(x.attrs).get(u"class", "")))
def coll_up(ilist, base=0, count=0):
    '''
    Recursively collapse nested lists at depth base and above.

    Levels shallower than `base` keep their list structure; everything at
    or below that depth is flattened into a single list.  `count` tracks
    the current recursion depth and is not meant to be passed by callers.
    '''
    if isinstance(ilist, (list, tuple)):
        # Recurse into each child one level deeper and splice the results.
        collapsed = []
        for child in ilist:
            collapsed += coll_up(child, base, count + 1)
    elif base > count:
        # Leaf above the collapse threshold: pass it through unwrapped.
        collapsed = ilist
    else:
        # Leaf at/below the threshold: wrap so it can be spliced by the caller.
        collapsed = [ilist]
    # Re-wrap intermediate levels that must keep their list structure.
    return [collapsed] if (count != 0 and base > count) else collapsed
def info_extract(ilist, count=0):
    '''
    Recursively walk a nested list and upon finding a non iterable, return its string.

    Tag elements are descended into via their .contents; plain strings are
    stripped and kept only when non-empty.  `count` tracks recursion depth:
    nested calls return their strings wrapped in one extra list level.
    '''
    found = []
    if isinstance(ilist, list):
        for node in ilist:
            if isinstance(node, Tag):
                # Descend into the tag's children one level deeper.
                found += info_extract(node.contents, count + 1)
            else:
                # Plain string node: keep it only if non-blank after stripping.
                text = node.strip()
                if text:
                    found.append(text)
    return [found] if count != 0 else found
def main():
url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
data = urllib2.urlopen(url).read()
soup = BeautifulSoup(data)
job_tags = soup.findAll(job_location)
if(job_tags):
job_tag = job_tags[0]
job_title = info_extract(job_tag.findAll(job_title_location))[0]
organ = info_extract(job_tag.findAll(organ_location))[0]
details = coll_up(info_extract(job_tag.findAll(details_key_location)), 2)
combined_dict = dict([tuple(["Job Title:"] + job_title)] + [tuple(["Organisation:"] + organ)] + [tuple(detail) for detail in details])
combined_list = [["Job Title:"] + job_title, ["Organisation:"] + organ] + details
postdata = [" ".join(x) for x in combined_list]
print postdata
fname = "postdata.txt"
with open(fname, "w") as outf:
outf.write("\n".join(postdata).encode("utf8"))
if __name__=="__main__":
main()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.