在python中使用BeautifulSoup解析數據

Question

我需要從一個網站解析數據： http : //www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html

BeautifulSoup 的大部分教程都是用於解析鏈接，而不是從鏈接中深入解析所需數據。

現在我瀏覽了 python 的 BeautifulSoup 模塊的一些教程，並編寫了這個腳本來下載所需的數據字符串

 <div id="content_box">
        <div id="content" class="hfeed">...

我正在使用的腳本：

from BeautifulSoup import BeautifulSoup
import urllib2

def main():
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup(data)

    postdata = bs.find('div', {'id': 'content_box'})
    postdata= [s.getText().strip() for s in postdata.findAll('div', {'class':'scdetail'})]

    fname = 'postdata.txt'
    with open(fname, 'w') as outf:
        outf.write('\n'.join(postdata))

if __name__=="__main__":
    main()

但是這個腳本沒有執行我所期望的。 我想明智地將發布數據放入文件中：

職位：國家電子和信息技術研究所副中心經理職位空缺 - 昌迪加爾

分中心經理

國立電子信息技術研究所

地址：NIELIT，昌迪加爾 SCO：114-116 Sector 17B

郵政編碼：160017

昌迪加爾市等......

請幫助或建議。

謝謝

Answer 1

你的問題在這里： postdata.findAll('div', {'class': 'scdetail'}) 。 當您在尋找div ，頁面具有span 。 將其更改為postdata.findAll('span', {'class': 'scdetail'})導致非空結果。

您要讀取的值之一的示例：

<div class="scheading">
    "Pay Scale: " <span class="scdetail" itemProp="baseSalary">Rs. 15,000/-</span>
</div>

Answer 2

這個 pyparsing 提取器將挑選出匹配的 div/span 標簽：

from pyparsing import makeHTMLTags, withAttribute, SkipTo

"""
sample:
<div class="scheading">Postal Code: <span class="scdetail" 
    itemprop="postalCode">160017</span></div>
"""
div,divEnd = makeHTMLTags("div")
span,spanEnd = makeHTMLTags("span")
div.setParseAction(withAttribute(("class","scheading")))
span.setParseAction(withAttribute(("class","scdetail")))

patt = (div + SkipTo(span)("label") + span + SkipTo(spanEnd)("value") + 
            spanEnd + divEnd)

attrs = {}
for match in patt.searchString(html):
    attrs[match.itemprop] = (match.label[0].strip(), match.value)

from pprint import pprint
pprint(attrs.items())

印刷：

[('skills',
  ('Desired Skills:',
   'Preference will be given to candidates having good knowledge of UNIX &amp; Visual FoxPro.')),
 ('qualifications',
  ('Qualifications:',
   '\x91A\x92 level of DOEACC / PGDCA with 2 years experience. ')),
 ('educationRequirements',
  ('Educational Requirements:',
   'B. E. / B. Tech. (CS / IT / Electronics) / MCA / M. Sc. (CS / IT / Electronics) / \x91B\x92 level of DOEACC ')),
 ('addressLocality', ('City', 'Chandigarh')),
 ('addressRegion', ('State', 'Haryana and Punjab')),
 ('streetAddress', ('Address:', 'NIELIT, Chandigarh SCO: 114-116 Sector 17B')),
 ('postalCode', ('Postal Code:', '160017')),
 ('baseSalary', ('Pay Scale:', 'Rs. 15,000/-'))]

Answer 3

此解決方案使用 BeautifulSoup

import os
import sys

# Import System libraries
import re
import urllib2

# Import Custom libraries
from BeautifulSoup import BeautifulSoup, Tag

job_location = lambda x: x.name == "div" and set([(u"id", u"content")]) <= set(x.attrs)
job_title_location = lambda x: set([(u"class", u"schema_title"), (u"itemprop", u"title")]) <= set(x.attrs)
organ_location = lambda x: set([(u"class", u"schema_hiringorganization"), (u"itemprop", u"name")]) <= set(x.attrs)
details_key_location = lambda x: x.name == "div" and bool(re.search("s.*heading", dict(x.attrs).get(u"class", "")))

def coll_up(ilist,base=0,count=0):
    '''
    Recursively collapse nested lists at depth base and above
    '''
    tlist = []
    if(isinstance(ilist,list) or isinstance(ilist,tuple)):
        for q in ilist:
            tlist += coll_up(q,base,count+1)
    else:
        if(base > count):
            tlist = ilist
        else:
            tlist = [ilist]
    return [tlist] if((count != 0) and (base > count)) else tlist

def info_extract(ilist, count=0):
    '''
    Recursively walk a nested list and upon finding a non iterable, return its string
    '''
    tlist = []
    if(isinstance(ilist, list)):
        for q in ilist:
            if(isinstance(q, Tag)):
                tlist += info_extract(q.contents, count+1)
            else:
                extracted_str = q.strip()
                if(extracted_str):
                    tlist += [extracted_str]
    return [tlist] if(count != 0) else tlist

def main():
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)

    job_tags = soup.findAll(job_location)
    if(job_tags):
        job_tag = job_tags[0]
        job_title = info_extract(job_tag.findAll(job_title_location))[0]
        organ = info_extract(job_tag.findAll(organ_location))[0]
        details = coll_up(info_extract(job_tag.findAll(details_key_location)), 2)

        combined_dict = dict([tuple(["Job Title:"] + job_title)] + [tuple(["Organisation:"] + organ)] + [tuple(detail) for detail in details])
        combined_list = [["Job Title:"] + job_title, ["Organisation:"] + organ] + details
        postdata = [" ".join(x) for x in combined_list]
        print postdata

        fname = "postdata.txt"
        with open(fname, "w") as outf:
            outf.write("\n".join(postdata).encode("utf8"))

if __name__=="__main__":
    main()

在python中使用BeautifulSoup解析數據

問題描述

3 個解決方案

解決方案1
0 2013-06-29 03:06:53

解決方案2
0 2013-06-29 11:02:37

解決方案3
0 已采納 2013-06-30 14:13:34

在python中使用BeautifulSoup解析數據

問題描述

3 個解決方案

解決方案1 0 2013-06-29 03:06:53

解決方案2 0 2013-06-29 11:02:37

解決方案3 0 已采納 2013-06-30 14:13:34

解決方案1
0 2013-06-29 03:06:53

解決方案2
0 2013-06-29 11:02:37

解決方案3
0 已采納 2013-06-30 14:13:34