How to add scraped data to a CSV file?

I am new to web scraping. I am scraping data from a website: I first scrape the hrefs from each listing page, then visit each href and find the 'p' tag inside the 'address-data' class. I want to store the first URL's 'p' tag data in one row, the second URL's 'p' tag data in a second row, and so on. My data is appended to 'myUrl'. I want to save the data to a CSV file, e.g. address, latitude, longitude, phone, email, and then start a new line for the next record.

here is my code:

from bs4 import BeautifulSoup
import requests
import csv

myUrl=[]
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
                #print(myUrl)
                with open('lhr.cv', 'w') as file:
                    writer = csv.writer(file)
                    for row in myUrl:
                        writer.writerow(row)

expected output:

9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/  reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
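For reference, two things in the question's code prevent it from producing this output: the file is reopened (and truncated) on every append, and writer.writerow() is given a single string, which the csv module splits into one character per column. Below is a minimal sketch of the row-per-URL pattern the question describes, using the question's own selectors; the output filename lhr.csv is an assumption:

from bs4 import BeautifulSoup
import requests
import csv

# same listing URLs as in the question
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]

# open the CSV once, outside the loops, so earlier rows are not overwritten
with open('lhr.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for url in urls:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all('a', class_='main-link'):
            detail = BeautifulSoup(requests.get(link.get('href')).content, "lxml")
            # one row per detail page: the text of each <p> inside the address-data divs
            row = [div.find('p').text for div in detail.find_all('div', attrs={"class": "address-data"})]
            writer.writerow(row)  # writerow takes a sequence; a bare string is split into characters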

I've written this in Python 2, using XPath (because I find it cleaner and simpler for web scraping), but this code will get you your list of links:

#Load required libraries
import requests
from lxml import html
import pandas as pd

#Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

#First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url+str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0] #This comes out as a list of two - we only want the first one

#Next, we want to scrape the links to each page with the address

links = []
names = []

for i in range(1,int(no_pages)+1):
    page = requests.get(url+str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}

pd.DataFrame(address_links).to_csv(u"address_links.csv")

This code still needs completing (the remaining appends, the rest of the dictionary, and a line to create the CSV; see the sketch after the block), but it will get your details:

address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []

counter = 0

for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)
    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]

    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]

    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]

    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]

    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]

    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]

    address_list.append(address)
    #continue for others

    counter+=1
    print counter

address_details = {"Name": names,
                  "URL": links,
                  "Address": address_list,
                   #continue for others
                  }
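A sketch of the missing pieces mentioned above (the remaining appends, the completed dictionary, and the CSV line); the column names and the output filename address_details.csv are assumptions:

# inside the loop, after address_list.append(address):
latitude_list.append(latitude)
longitude_list.append(longitude)
telephone_list.append(telephone)
email_list.append(email)
webpage_list.append(webpage)

# after the loop, complete the dictionary and write it out
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}

pd.DataFrame(address_details).to_csv("address_details.csv", index=False, encoding="utf-8")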

You might need to add in some Unicode encoding before you turn it into a CSV; that's answered here.
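In Python 2 that typically means either passing encoding="utf-8" to to_csv, as in the sketch above, or encoding each field yourself before it goes into the lists, along these lines:

# Python 2: encode unicode values to UTF-8 byte strings before writing
address_list = [a.encode("utf-8") if isinstance(a, unicode) else a for a in address_list]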
