简体   繁体   中英

Python, Beautiful soup, how to extract data and print to csv file

So I have been working on this for a while and I cannot seem to find an answer to it or figure it out. So I am extracting data from steam and I need to figure out how to get the platforms, for example mac and turn it into a number (string number). For example if a game supports mac it will show up in my list as a "1" but if it does not it will show up as a "0". I am having the problem of the code only running once and making it all to "1".

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import timedelta
import datetime
import time
import csv
my_url = 'https://store.steampowered.com/search/?specials=1&page=1'

#opening up connectin, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#html parsing
page_soup = soup(page_html, "html.parser")

#grab products
containers = page_soup.findAll("div", {"class":"responsive_search_name_combined"})

filename = "products.csv"
f = open(filename, "w", encoding='UTF-8')
headers = "Titles, Release_date, Discount, Price before, Price after, Positive review, Reviewers, Win, Lin, Osx, Time \n"
f.write(headers)
#f.write(headers)
#len(containers)
#containers[1]
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)

for container in containers:
    titles_container = container.findAll("span",{"class":"title"})
    titl = titles_container[0].text
    print(titl)

    product_container = container.findAll("div",{"class":"search_released"})
    product_date = product_container[0].text
    print(product_date)

    product_discount_container = container.findAll("div",{"class":"search_discount"})
    product_discount = product_discount_container[0].text
    print(product_discount)

    product_price_container_before = container.findAll("div",{"class":"search_price"})
    product_price_before = product_price_container_before[0].text
    test = re.findall('(\d+\W)',product_price_before)
    testing = test[0] + test[1]
    print(testing)

    product_price_container_after = container.findAll("div",{"class":"discounted"})
    for product_price_after in product_price_container_after:
        product_price_after.find("span").extract()
        print(product_price_after.text)

    product_review_container = container.findAll("span",{"class":"search_review_summary"})
    for product_review in product_review_container:
        prr = product_review.get('data-tooltip-html')
        a = re.findall('(\d+%)|(\d+\d+)',prr)
        c = a[1][1]
        print(c)


    product_platform_container = container.findAll("span",{"class":"platform_img"})
    for product_platform in product_platform_container:
        platform = product_platform.get('class')[1]
        platt = re.findall('(\Aw)',platform)
        plattt = re.findall('(\Am)',platform)
        platttt = re.findall('(\Al)',platform)
        print(platt)
        print(plattt)
        print(platttt)

        for p in plattt:
            if "m" in p:
                macken = "1"    
            elif "m" not in p:
                macken = "0"
            print(macken)


    f.write(titl + "," + product_date.replace(",","") + "," + product_discount.replace("\n", "") + "," + testing.replace(",", ".") + "," + product_price_after.text.replace("\n","").replace(" ", "").replace(",",".").replace("\t\t\t\t\t\t\t","") + "," + a[0][0] + "," + c.replace(",","") + "," + y + "," + macken + "," + "blah" + "," + st + "\n")

f.close()
pd.read_csv("products.csv", error_bad_lines=False)

I am also writing it over to csv file. So when I write it to the csv file it just say 1, 1, 1, 1, 1...

I am getting the data from this page: 'https://store.steampowered.com/search/?specials=1&page=1'

I know this question is a little confusing so hopefully you can help, if there is any more code you need, let me know.

your statement was wrong that is why you getting 1, see the code below!

import requests,csv
from bs4 import BeautifulSoup


req = requests.get('https://store.steampowered.com/search/?specials=1&page=1')
soup = BeautifulSoup(req.content,'html.parser')
data = []
for platform in soup.find_all('div', attrs={'class':'col search_name ellipsis'}):
    title = platform.find('span',attrs={'class':'title'}).text
    if platform.find('span',attrs={'class':'win'}):
        win = '1'
    else:
        win = '0'

    if platform.find('span',attrs={'class':'mac'}):
        mac = '1'
    else:
        mac = '0'

    if platform.find('span',attrs={'class':'linux'}):
        linux = '1'
    else:
        linux = '0'

    data.append({
        'title':title.encode('utf-8'),
        'win':win,
        'mac':mac,
        'linux':linux})

with open('data.csv', 'w', newline='') as f:
    fields = ['title','win','mac','linux']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data) 

Here is how I would do it:

import csv

# ...

rows = []
product_platform_container = container.findAll("span",{"class":"platform_img"})
for product_platform in product_platform_container:
    platform = product_platform.get('class')[1]
    win_p = re.findall('(\Aw)',platform)
    mac_p = re.findall('(\Am)',platform)
    linux_p = re.findall('(\Al)',platform)
    print(win_p)
    print(mac_p)
    print(linux_p)
    row = {
       "linux": 1 if linux_p else 0,
       "win": 1 if win_p else 0,
       "mac": 1 if mac_p else 0
    }
    rows.append(row)

# After you parsed all entries...
fieldnames = ['mac', 'win', 'linux']
writer = csv.DictWriter(f, fieldnames=fieldnames)

writer.writeheader()
for row in rows:
    writer.writerow(row)

Explanation : After we identified platform with re ,we create csv row where mac , win and linux will have 1 only if their corresponding matches ( mac_p , win_p and linux_p ) are not empty. Here f is your opened file object. Checkout this article which shows how to work with csv files in python.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM