简体   繁体   中英

Python Scraping/Parsing with BeautifulSoup

I'm trying to scrape a url with BeautifulSoup/Requests, and then clean it by pulling out just the sections I need. After having decided on a different target url, it outputs the HTML correctly, but my code for cleaning it is not working. Here is my code:

import requests
from bs4 import BeautifulSoup
import bs4.element
import pprint

def connection(url):
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5)'}
    r = requests.get(url,headers=headers)
    soup = BeautifulSoup(r.text)
    return soup

def scrape_metacritic(soup,movie_list=[]):
    for mlist in get_modules(soup).items():
        for movie in mlist:
            try:
                m = parse_movie_li(movie)
            except:
                continue
            #m['release_type']=release_type
            movie_list.append(m)
    return movie_list


def just_tags(templist):
    tags = [t for t in templist if isinstance(t,bs4.element.Tag)]
    return tags 



def get_modules(soup):
        module = soup.find(class_='body_wrap') #body_wrap
        module_dict = {}
        for mod in module.find_all('li', class_='product'): 
            movie_lis = just_tags(mod.find('ul').contents)
            module_dict[mod]=movie_lis
        return module_dict

    get_modules(soup)

That part works. Here is the url:

url = 'http://www.metacritic.com/browse/movies/title/dvd/a?view=detailed'
soup = connection(url)

This is some of what I'm getting after the scrape:

        {<li class="product limited_release_product has_small_image"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-birders-guide-to-everything">A Birder's Guide to Everything</a></h3></div><a class="basic_stat product_score" href="/movie/a-birders-guide-to-everything">
     <span class="metascore_w medium movie positive">61</span>
     </a></div> <div class="more_stats extended_stats">
     <ul class="more_stats">
     <li class="stat release_date">
     <span class="label">Release Date:</span>
     <span class="data">March 21, 2014</span>
     </li>
     <li class="stat rating">
     <span class="label">Rated:</span>
     <span class="data">
    ..
    <span class="data">136 min</span>
     </li>]}

Now I try to clean it with this:

from dateutil import parser

def parse_movie_li(li):
    title_div = li.find(class_='product_title')

    movie = {
        'title':title_div.text.strip(),
        'rel_url':title_div.find('a')['href'],
        'release_date':get_release_date(li.find(class_='release_date').find(class_='data')),
        'metascore_w':get_metascore_w(li.find(class_='metascore_w')),
        'user_score':get_user_score(li.find(class_='product avg_userscore').find(class_='data')), #add func
        'genre':get_genre(li.find(class_='genre').find(class_='data')), #add func
        'star_cast':get_star_cast(li.find(class_='cast').find(class_='data')), #add func
        'runtime':get_runtime(li.find(class_='runtime').find(class_='data')) #add func
    }
    #print movie,'\n'
    return movie

def get_metascore_w(div):
    try:
        score = div.text
    except:
        print 'no text in metascore div'
        return None
    try: 
        score = int(score)
    except: 
        pass
    return score

def get_release_date(div):
    try: 
        datestr = div.text
    except:
        return None
    try: 
        date = parser.parse(datestr)
    except:
        return datestr
    return date

def get_user_score(div):
    try:
        uscore = div.text
    except:
        print 'no text in userscore div'
        return None
    try: 
        uscore = int(uscore)
    except: 
        pass
    return uscore

def get_genre(div):
    try:
        genre = div.text
    except:
        print 'no text in genre div'
        return None
    try: 
        genre
    except: 
        pass
    return score

def get_star_cast(div):
    try:
        cast = div.text
    except:
        print 'no text in cast div'
        return None
    try: 
        cast
    except: 
        pass
    return cast

def get_runtime(div):
    try:
        runtime = div.text.strip(' min')
    except:
        print 'no text in runtime div'
        return None
    try: 
        runtime = int(runtime)
    except: 
        pass
    return runtime

It should be outputting in this form:

[{'metascore_w': 28,
  'rel_url': '/movie/mortdecai',
  'release_date': datetime.datetime(2015, 1, 23, 0, 0),
  'release_type': u'Wide releases now in theaters',
  'title': u'Mortdecai'},
 {'metascore_w': 24,
  'rel_url': '/movie/strange-magic',
  'release_date': datetime.datetime(2015, 1, 23, 0, 0),
  'release_type': u'Wide releases now in theaters',
  'title': u'Strange Magic'},
..
{'metascore_w': u'tbd',
  'rel_url': '/movie/20-once-again',
  'release_date': datetime.datetime(2015, 1, 16, 0, 0),
  'release_type': u'Limited releases now in theaters',
  'title': u'20 Once Again'}]

However, I'm getting this:

{<li class="product limited_release_product has_small_image alt"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-family-thing">A Family Thing</a></h3></div><a class="basic_stat product_score" href="/movie/a-family-thing">
<span class="metascore_w medium movie positive">71</span>
</a></div> <div class="more_stats extended_stats">
<ul class="more_stats">
<li class="stat release_date">
<span class="label">Release Date:</span>
<span class="data">March 29, 1996</span>
</li>..

It is unparsed. Any guidance on what I'm doing incorrectly with the parse_movie_li function?

The error is actually really simple. In the parse_movie_li() function you are calling a find method on "li" when you're not allowed to. I'm not exactly sure where you're calling the method or what variable you're putting into it. But wherever you get "li" I would chain .find(class_='product_title') to that part of the function. You can however target the children of it like so: li.div.b to get the b tags in the div tags in the li tags.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM