简体   繁体   中英

how to export the Scraped data into excel with pre-defined header python?

currently i am printing the data.now rather than printing i want to export to

excel./csv new to python pls help.


**data is very huge around 9000 rows with 6 columns?**
 import requests from urllib3.exceptions import InsecureRequestWarning requests.packages.urllib3.disable_warnings(InsecureRequestWarning) from bs4 import BeautifulSoup as bs def scrape_bid_data(): page_no = 1 #initial page number while True: print('Hold on creating URL to fetch data...') URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no) #create dynamic URL print('URL cerated: ' + URL) scraped_data = requests.get(URL,verify=False) # request to get the data soup_data = bs(scraped_data.text, 'lxml') #parse the scraped data using lxml extracted_data = soup_data.find('div',{'id':'pagi_content'}) #find divs which contains required data if len(extracted_data) == 0: # **if block** which will check the length of extracted_data if it is 0 then quit and stop the further execution of script. break else: for idx in range(len(extracted_data)): # loops through all the divs and extract and print data if(idx % 2 == 1): #get data from odd indexes only because we have required data on odd indexes bid_data = extracted_data.contents[idx].text.strip().split('\\n') print('-' * 100) print(bid_data[0]) #BID number print(bid_data[5]) #Items print(bid_data[6]) #Quantitiy Required print(bid_data[10] + bid_data[12].strip()) #Department name and address print(bid_data[16]) #Start date print(bid_data[17]) #End date print('-' * 100) page_no +=1 #increments the page number by 1 scrape_bid_data()

I think you should start by returning the extract_data object containing your data at the end of your function.

 page_no = 1
 def scrap_bid_data(page):

     print('Hold on creating URL to fetch data...')
     URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page)
     print('URL cerated: ' + URL)

     scraped_data = requests.get(URL,verify=False) # request to get the data
     soup_data = bs(scraped_data.text, 'lxml') #parse the scraped data using lxml
     extracted_data = soup_data.find('div',{'id':'pagi_content'}) 
     return extracted_data

Then use it to create a dataframe

  extract_data = scrap_bid_data(page_no)
  import pandas as pd
  df = pd.DataFrame(extract_data) 

and then export this fataframe.

 df.to_csv ('file_name_{}'.format(page_no))

here you go...

import requests
from urllib3.exceptions import InsecureRequestWarning
import csv

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

f = csv.writer(open('gembid.csv', 'w'))
f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate'])


def scrap_bid_data():
    page_no = 1
    while page_no < 911:
        print('Hold on creating URL to fetch data...')
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
        print('URL created: ' + url)
        scraped_data = requests.get(url, verify=False)
        soup_data = bs(scraped_data.text, 'lxml')
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        if len(extracted_data) == 0:
            break
        else:
            for idx in range(len(extracted_data)):
                if (idx % 2 == 1):
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')

                    bidno = bid_data[0].split(":")[-1]
                    items = bid_data[5].split(":")[-1]
                    qnty = int(bid_data[6].split(':')[1].strip())
                    dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
                    edate = bid_data[17].split("End Date:")[-1]
                    f.writerow([bidno, items, qnty, dept, edate])

            page_no=page_no+1
scrap_bid_data()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM