I'm scraping data from a list of URLs (input.txt) and saving the results in output.txt.
I want to delete each URL from the input file as soon as it has been scraped in the loop.
This is my code:
def scrape(url):
    """Fetch *url* and return the scraped payload (a JSON-like object).

    Placeholder: the real scraping logic goes here and must assign `json`
    before returning it.
    """
    # do scraping and return json
    # (bug fix: the original used a C-style `//` comment, which is a
    # Python syntax error)
    return json
# Read every URL from input.txt, scrape it, and append the results to
# output.txt. input.txt is opened "r+" so it can later be rewritten in place.
with open("input.txt", "r+") as urllist, open("output.txt", "a+") as outfile:
    for url in urllist.read().splitlines():
        data = scrape(url)
        if data:
            if data["products"] is None:
                # Bug fix: the original had no `%` argument, so it printed
                # the literal text "data NOT FOUND: %s" instead of the URL.
                print("data NOT FOUND: %s" % url)
            else:
                for product in data["products"]:
                    print("Saving data: %s" % product["data"])
                    outfile.write(product["data"])
                    outfile.write("\n")
I included the following code in the loop to delete each URL after it has been processed, but it deletes all the URLs at once rather than one by one:
# start new code
# Bug fix: the earlier urllist.read() left the file pointer at EOF, so
# readlines() here returned [] and nothing was preserved — rewind first.
urllist.seek(0)
d = urllist.readlines()
urllist.seek(0)
# Bug fix: without truncate(), old bytes beyond the rewritten portion
# remain in the file, leaving stale lines at the end.
urllist.truncate()
for i in d:
    # Lines read from the file keep their trailing newline while `url`
    # (produced by splitlines()) does not, so compare the stripped line.
    if i.rstrip("\n") != url:
        urllist.write(i)
The input.txt file contains the following data:
url1
url2
url3
while the output.txt file contains:
data1
data2
data3
Below is the answer code I am referring to.
I have shared an example of removing a line from a file after using that line. Note that I added a function named "printFileContents" to show you what happens to the file contents after each iteration of scraping. That function is not actually necessary, just nice to visualize what is happening. See example below:
def scrape(url):
    """Stand-in for the real scraper; always reports success for *url*."""
    # Do some stuff
    return True
def executeScrapeIteration(input_file):
    """Scrape the first URL in *input_file*, then rewrite the file without it.

    *input_file* must be a text file opened in a read/write mode (e.g. "r+")
    with its pointer at position 0.
    """
    # Get the first line in the file
    url = input_file.readline()
    # Do your scraping and whatever else
    scrape(url)
    # To remove the line you just used, rewrite the file without that line.
    lines = input_file.readlines()
    input_file.seek(0)
    input_file.truncate()
    for line in lines:
        if line != url:
            input_file.write(line)
    # Bug fix: rewind so the next call's readline() starts at the top.
    # The original left the pointer at EOF and silently depended on
    # printFileContents() performing this seek — making that "optional"
    # debug helper load-bearing.
    input_file.seek(0)
# This function is just to show you what happens to the file after each scrape iteration
def printFileContents(input_file, i):
input_file.seek(0)
print("-----------------")
print("After iteration " + str(i) + ":\n")
print(input_file.read())
print("\n-----------------\n\n")
input_file.seek(0)
# main function
# main entry point
if __name__ == "__main__":
    with open("input.txt", "r+") as input_file:
        # Count the URLs, then rewind to the start of the file.
        line_count = len(input_file.readlines())
        input_file.seek(0)
        # Run one scrape iteration per URL originally in the file.
        for x in range(line_count):
            executeScrapeIteration(input_file)
            printFileContents(input_file, x)
My input.txt file is as follows:
url1
url2
url3
Just copy/paste my python script and input.txt file, then run the python script.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.