
Scraping multiple pages from a website into a csv file

I'm trying to scrape data from multiple pages of the same website into a single CSV file ;)

The way this script works: the URLs are written into a url.txt file, the script reads them from that file, scrapes each page, and saves the results into a CSV file.
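To illustrate the flow I have in mind (read every URL from url.txt, scrape each page, and append all the rows to one CSV), here is a minimal sketch; the ".item" selector and the file names are placeholders, not the real ones from my script below:

'''
import csv
import requests
from bs4 import BeautifulSoup

def scrape_all(url_file="url.txt", out_file="results.csv"):
    # read one URL per line from the text file
    with open(url_file) as f:
        urls = [line.strip() for line in f if line.strip()]

    rows = []  # rows from every page are collected into this single list
    for u in urls:
        page = requests.get(u, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(page.content, "html.parser")
        for item in soup.select(".item"):  # ".item" is a placeholder selector
            rows.append([u, item.get_text(strip=True)])

    # write everything into one CSV file
    with open(out_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Page", "Item"])
        writer.writerows(rows)

if __name__ == "__main__":
    scrape_all()
'''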

I have been trying to figure this out on my own, but I need the help of the bright minds of this community.

As you can see in the code, I am trying to scrape data from kakaku.com (a Japanese website).

'''
import os
import sys
import csv
import codecs
import requests
from bs4 import BeautifulSoup


# scraping function for kakaku.com / old version
def kakaku_scraper_o(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers = headers)
        soup = BeautifulSoup(page.content, 'html.parser')

        titles_temp = soup.find_all(class_ = "ckitemLink")
        prices_temp = soup.find_all(class_ = "pryen")
        links_temp = soup.find_all(class_ = "ckitanker")
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
                    
        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                # write the strings directly; the file is already opened with utf-8 encoding,
                # so calling .encode("utf8") here would put byte literals like b'...' into the CSV
                csvWriter.writerow([links[i], titles[i], prices[i]])
     
 
# scraping function for kakaku.com / new version
def kakaku_scraper_n(url):   
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers = headers)
        soup = BeautifulSoup(page.content, 'html.parser')

        titles_temp = soup.find_all(class_ = "p-list_name")
        prices_temp = soup.find_all(class_ = "p-list_price_data_price_num-1 p-num")
        links_temp = soup.find_all(class_ = 'p-list_name')
        
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links_temp[i] = links_temp[i].find("a")
            links.append("https://kakaku.com" + str(links_temp[i]['href']))
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())

        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])


# scraping function for bestgate.net
def bestgate_scraper(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers = headers)
        soup = BeautifulSoup(page.content, 'html.parser')

        titles_temp = soup.find_all(class_ = "name")
        prices_temp = soup.find_all(class_ = "price")
        links_temp = soup.find_all(class_ = 'name')
        
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())

        filename = u.split("/")
        filename = filename[-2] + "_bestgate.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])


# main function
if __name__ == '__main__':
    with open("url.txt", mode='r', newline='') as urlfile:
        url = urlfile.read().splitlines()
        print(url)
        urlfile.close()
        
    # sort out the links for each website's function
    kko = []
    kkn = []
    btg = []
    for u in url:
        # old kakaku.com pages have "aspx" in the URL, new ones do not;
        # everything else goes to the bestgate scraper
        if "kakaku" in u:
            if "aspx" in u:
                kko.append(u)
            else:
                kkn.append(u)
        else:
            btg.append(u)
            
    bestgate_scraper(btg)
    
    kakaku_scraper_o(kko)
    
    kakaku_scraper_n(kkn)
'''

I don't completely understand your question, but I will make the following observations:

  1. Mark the code in the question as Python code, so it is easier to read.
  2. Make the code as modular as possible (see the sketch below).
  3. Make clear what exactly your doubt is.

If you update your question I may help you.
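Regarding point 2: all three scrapers follow the same pattern (fetch the page, collect titles/prices/links by class name, write a CSV), so they could be collapsed into one generic function that takes the class names as parameters. This is only a rough sketch built on that assumption; the class names are taken from your code, while the scrape helper, the zip-based pairing, and the base parameter are my own illustration:

'''
import csv
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape(urls, title_cls, price_cls, link_cls, suffix, base=""):
    # one generic scraper: the per-site differences are just the class names
    for u in urls:
        soup = BeautifulSoup(requests.get(u, headers=HEADERS).content, "html.parser")
        titles = soup.find_all(class_=title_cls)
        prices = soup.find_all(class_=price_cls)
        links = soup.find_all(class_=link_cls)

        filename = u.split("/")[-2] + suffix
        with open(filename, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Link", "Title", "Price"])
            for t, p, l in zip(titles, prices, links):
                # the link element is either an <a> itself or a container holding one
                a = l if l.name == "a" else l.find("a")
                writer.writerow([base + a["href"], t.get_text(), p.get_text()])

# the three site-specific calls then become one-liners, e.g.:
# scrape(kko, "ckitemLink", "pryen", "ckitanker", "_kakaku.csv")
# scrape(kkn, "p-list_name", "p-list_price_data_price_num-1 p-num", "p-list_name", "_kakaku.csv", base="https://kakaku.com")
# scrape(btg, "name", "price", "name", "_bestgate.csv")
'''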
