
Appending scraped data to CSV file

I have been messing around with Python for the past few days, and while following Edmund Martin's tutorial I ran into a problem:

I would like to append the name and title that I scraped to a CSV file. The only problem is that the scraped data does not appear in the file.

Could you explain why only "rank", "title", and "description" are being written to the CSV file and not the actual data? Also, how can I solve that?

Below is the code from the tutorial website, together with the lines I added at the end:

import requests
from bs4 import BeautifulSoup
import time
import csv 

USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
              'Safari/537.36'}


def fetch_results(search_term, number_results, language_code):
    assert isinstance(search_term, str), 'Search term must be a string'
    assert isinstance(number_results, int), 'Number of results must be an integer'
    escaped_search_term = search_term.replace(' ', '+')

    google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results, language_code)
    response = requests.get(google_url, headers=USER_AGENT)
    response.raise_for_status()

    return search_term, response.text


def parse_results(html, keyword):
    soup = BeautifulSoup(html, 'html.parser')

    found_results = []
    rank = 1
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:

        link = result.find('a', href=True)
        title = result.find('h3', attrs={'class': 'r'})
        description = result.find('span', attrs={'class': 'st'})
        if link and title:
            link = link['href']
            title = title.get_text()
            description = description.get_text()
            if link != '#':
                found_results.append({
                    'rank': rank,
                    'title': title,
                    'description': description
                })
                rank += 1
    return found_results


def scrape_google(search_term, number_results, language_code):
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        results = parse_results(html, keyword)
        return results
    except AssertionError:
        raise Exception("Incorrect arguments parsed to function")
    except requests.HTTPError:
        raise Exception("You appear to have been blocked by Google")
    except requests.RequestException:
        raise Exception("Appears to be an issue with your connection")


if __name__ == '__main__':
    keywords = ['python']
    data = []
    for keyword in keywords:
        try:
            results = scrape_google(keyword, 2, "en")
            for result in results:
                data.append(result)
        except Exception as e:
            print(e)
        finally:
            time.sleep(1)
print(data)

with open('python_scrape.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(data)

csvFile.close()

Thanks for the help!

def parse_results(html, keyword):
    # code ....
    for result in result_block:

        link = result.find('a', href=True)  # here you get the link
        title = result.find('h3', attrs={'class': 'r'})  # here you get the title
        description = result.find('span', attrs={'class': 'st'})  # here you get the description

        # If you want to scrape something else, you can print(result) here
        # to see what data the result variable holds, and then parse that
        # data and save it in a variable, for example:
        # body = result.find('h1', attrs={'class': 'h1'})

        if link and title:
            link = link['href']
            title = title.get_text()
            description = description.get_text()

            # here we take the text from that body
            # body = body.get_text()

            if link != '#':
                found_results.append({
                    'rank': rank,
                    'title': title,
                    'description': description,

                    # and here we append it to the dictionary
                    # (kept commented out so the snippet runs as-is
                    # without the body lines above)
                    # 'body': body
                })
                rank += 1
    return found_results

Because you're using csv.writer.writerows (which ends in 's'; rows is plural) rather than writerow, the csv writer expects a list of iterable objects, each of which it treats as a row.

Your main block uses scrape_google() to return a list of dictionaries, which all look like {'rank': rank, 'title': title, 'description': description}.

Python iterates through a dictionary by returning each key, so what writerows is seeing in each row is just the keys "rank", "title", and "description".
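
For example, iterating over one of these dictionaries (here with made-up values) yields only its keys, never its values:

>>> row = {'rank': 1, 'title': 'Python', 'description': 'A programming language'}
>>> list(row)
['rank', 'title', 'description']
>>> [row[key] for key in row]
[1, 'Python', 'A programming language']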

The fastest way to fix what is happening is to add the line

data = [[row[key] for key in row] for row in data]

before your "with open('python_scrape.csv'..." line; note that the list being written is called data at that point in your script, not results. This uses a list comprehension, which is a good thing to learn about as a new Python user.
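
Putting it together, the end of your script would look something like this (a sketch; the newline='' argument and the header row are my additions, since the csv module documentation recommends newline='' when writing):

data = [[row[key] for key in row] for row in data]  # convert each dict to a list of values

with open('python_scrape.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['rank', 'title', 'description'])  # header row
    writer.writerows(data)  # each inner list becomes one CSV row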

A better way to fix your code would be to make sure that it builds up a list of lists to be written to the CSV, instead of a list of dictionaries.
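
Alternatively, because the data is already a list of dictionaries, the standard library's csv.DictWriter can write it without any conversion; this is just one more option, not part of the fix above:

with open('python_scrape.csv', 'w', newline='') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=['rank', 'title', 'description'])
    writer.writeheader()  # writes the field names as the first row
    writer.writerows(data)  # each dict becomes one row, matched by key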
