
Multiple Python scripts running concurrently with different inputs

So I have this Python script that scrapes listings off a specific Craigslist URL the user constructs (location, max price, type of item, etc.). It then goes to the URL, scrapes the listing information (price, date posted, etc.) and returns three outputs. One is the "x" items around the average price (the user determines the number of items and the price range, e.g. $100 off the average price). Next are the "x" closest listings based on the zip code the user provided at the beginning (the user also determines how many items are shown, based on proximity to the zip code). Finally, the Craigslist URL link is output to the user so they can visit the page and view the items that were shown to them earlier. The scraped data is stored in a data.json file and a data.csv file; the contents are identical, just formatted differently. I want to offload this data to a database every time a scrape finishes, either Cloud Firestore or AWS DynamoDB, since I want to host this web app in the future.
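For the database-offload part, a minimal sketch of the DynamoDB side with boto3, taking the same list of dictionaries that convert_to_json receives (the table name "listings" and its key scheme are assumptions, not part of the original script; note that DynamoDB rejects Python floats, so they must be converted to Decimal):

import boto3  # AWS SDK for Python
from decimal import Decimal

def export_to_dynamodb(sample_list, table_name="listings"):
    # Writes each scraped listing dict to an existing DynamoDB table.
    table = boto3.resource("dynamodb").Table(table_name)
    with table.batch_writer() as batch:  # groups puts into batched write requests
        for item in sample_list:
            # DynamoDB does not accept float types, so cast them to Decimal.
            batch.put_item(Item={k: Decimal(str(v)) if isinstance(v, float) else v
                                 for k, v in item.items()})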

What I want to do is allow the user to have multiple instances of the same script, each with its own unique Craigslist URL, all running at the same time. All of the code is identical; the only difference is the Craigslist URL each script scrapes.

I thought of a method that builds the attributes (location, max price, etc.) through iteration and returns the finished URL, but in my main I call the constructor, which needs all of those attributes, so I would have to fish the URL back out of it, which seems over the top.

I then tried using a loop in my main: the user determines how many URL links they want to create, and each finished link is appended to a list. I ran into the same problem again.
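One way around both problems is a small helper that prompts for one full set of search parameters and returns them as a tuple; the main loop collects as many sets as the user asks for and only then constructs the scrapers. A minimal sketch against the CraigslistScraper class shown below (prompt_params is a new, illustrative helper, not part of the original code):

def prompt_params():
    # Prompts for one set of search parameters; no browser is opened here.
    location = input("Enter the location you would like to search: ")
    zip_code = input("Enter the zip code you would like to base radius off of: ")
    max_price = input("Enter the max price you would like the search to use: ")
    query = input("Enter the item you would like to search: ")
    radius = input("Enter the radius you would like the search to use: ")
    return (location, zip_code, max_price, query, radius)

count = int(input("How many URLs would you like to scrape?: "))
param_sets = [prompt_params() for _ in range(count)]
# Each scraper is constructed from its own parameter set and therefore
# carries its own unique url; nothing has to be fished back out.
scrapers = [CraigslistScraper(*params) for params in param_sets]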

import json

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# import urllib.request       # only needed by the commented-out extract_post_urls
# from bs4 import BeautifulSoup


class CraigslistScraper(object):

    # Constructor of the URL that is being scraped
    def __init__(self, location, postal_code, max_price, query, radius):
        self.location = location  # Location (i.e. city) being searched
        self.postal_code = postal_code  # Postal code of location being searched
        self.max_price = max_price  # Max price of the items that will be searched
        self.query = query  # Search for the type of items that will be searched
        self.radius = radius  # Radius of the area searched, derived from the postal code given previously

        self.url = f"https://{location}.craigslist.org/search/sss?&max_price={max_price}&postal={postal_code}&query={query}&20card&search_distance={radius}"
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver")  # Path of the Chrome web driver
        self.delay = 7  # The delay the driver gives when loading the web page

    # Load up the web page
    # Gets all relevant data on the page
    # Goes to next page until we are at the last page
    def load_craigslist_url(self):
        data = []
        # url_list = []
        self.driver.get(self.url)
        while True:
            try:
                wait = WebDriverWait(self.driver, self.delay)
                wait.until(EC.presence_of_element_located((By.ID, "searchform")))
                data.append(self.extract_post_titles())
                # url_list.append(self.extract_post_urls())
                WebDriverWait(self.driver, 2).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
            except TimeoutException:  # no clickable "next" button left: we are on the last page
                break
        return data

    # Extracts all relevant information from the web page and returns them as individual lists
    def extract_post_titles(self):
        all_posts = self.driver.find_elements_by_class_name("result-row")

        dates_list = []
        titles_list = []
        prices_list = []
        distance_list = []

        for post in all_posts:
            title = post.text.split("$")

            if title[0] == '':
                title = title[1]
            else:
                title = title[0]

            title = title.split("\n")
            price = title[0]
            title = title[-1]
            title = title.split(" ")
            month = title[0]
            day = title[1]
            title = ' '.join(title[2:])
            date = month + " " + day

            if not price[:1].isdigit():
                price = "0"
            price = int(price)  # was a bare `int(price)`, which discarded the cast

            raw_distance = post.find_element_by_class_name('maptag').text
            distance = raw_distance[:-2]

            titles_list.append(title)
            prices_list.append(price)
            dates_list.append(date)
            distance_list.append(distance)

        return titles_list, prices_list, dates_list, distance_list

    # Gets all of the url links of each listing on the page
    # def extract_post_urls(self):
    #     soup_list = []
    #     html_page = urllib.request.urlopen(self.driver.current_url)
    #     soup = BeautifulSoup(html_page, "html.parser")
    #     for link in soup.findAll("a", {"class": "result-title hdrlnk"}):
    #         soup_list.append(link["href"])
    #
    #     return soup_list

    # Kills browser
    def kill(self):
        self.driver.close()

    # Gets price value from dictionary and computes average
    @staticmethod
    def get_average(sample_dict):
        price = list(map(lambda x: x['Price'], sample_dict))
        sum_of_prices = sum(price)
        length_of_list = len(price)
        average = round(sum_of_prices / length_of_list)

        return average

    # Displays items around the average price of all the items in prices_list
    @staticmethod
    def get_items_around_average(avg, sample_dict, counter, give):
        print("Items around average price: ")
        print("-------------------------------------------")
        raw_list = []
        for z in range(len(sample_dict)):
            current_price = sample_dict[z].get('Price')
            if abs(current_price - avg) <= give:
                raw_list.append(sample_dict[z])
        final_list = raw_list[:counter]
        for index in range(len(final_list)):
            print('\n')
            for key in final_list[index]:
                print(key, ':', final_list[index][key])

    # Displays nearest items to the zip provided
    @staticmethod
    def get_items_around_zip(sample_dict, counter):
        final_list = []
        print('\n')
        print("Closest listings: ")
        print("-------------------------------------------")
        x = 0
        while x < min(counter, len(sample_dict)):  # don't run past the end of the list
            final_list.append(sample_dict[x])
            x += 1
        for index in range(len(final_list)):
            print('\n')
            for key in final_list[index]:
                print(key, ':', final_list[index][key])

    # Converts all_of_the_data list of dictionaries to json file
    @staticmethod
    def convert_to_json(sample_list):
        with open(r"C:\Users\diego\development\WebScraper\data.json", 'w') as file_out:
            file_out.write(json.dumps(sample_list, indent=4))

    @staticmethod
    def convert_to_csv(sample_list):
        df = pd.DataFrame(sample_list)
        df.to_csv("data.csv", index=False, header=True)


# Main, where the big list of data is broken down to its individual parts to be converted to a .csv file
# Also sets the parameters of the search
if __name__ == "__main__":

    location = input("Enter the location you would like to search: ")  # Location Craigslist searches
    zip_code = input(
        "Enter the zip code you would like to base radius off of: ")  # Postal code Craigslist uses as a base for 'MILES FROM ZIP'
    type_of_item = input(
        "Enter the item you would like to search (ex. furniture, bicycles, cars, etc.): ")  # Type of item you are looking for
    max_price = input(
        "Enter the max price you would like the search to use: ")  # Max price Craigslist limits the items to
    radius = input(
        "Enter the radius you would like the search to use (based off of zip code provided earlier): ")  # Radius from postal code Craigslist limits the search to

    scraper = CraigslistScraper(location, zip_code, max_price, type_of_item,
                                radius)  # Constructs the URL with the given parameters

    results = scraper.load_craigslist_url()  # Inserts the result of the scraping into a large multidimensional list

    titles_list = results[0][0]
    prices_list = list(map(int, results[0][1]))
    dates_list = results[0][2]
    distance_list = list(map(float, results[0][3]))

    scraper.kill()

    # Merge all of the lists into a dictionary
    # Dictionary is then sorted by distance from smallest -> largest
    list_of_attributes = []

    for i in range(len(titles_list)):
        content = {'Listing': titles_list[i], 'Price': prices_list[i], 'Date posted': dates_list[i],
                   'Distance from zip': distance_list[i]}
        list_of_attributes.append(content)

    list_of_attributes.sort(key=lambda x: x['Distance from zip'])

    scraper.convert_to_json(list_of_attributes)
    scraper.convert_to_csv(list_of_attributes)
    # scraper.export_to_mongodb()

    # Below function calls:
    # Get average price and prints it
    # Gets/prints listings around said average price
    # Gets/prints nearest listings

    average = scraper.get_average(list_of_attributes)
    print(f'Average price of items searched: ${average}')
    num_items_around_average = int(input("How many listings around the average price would you like to see?: "))
    avg_range = int(input("Range of listings around the average price: "))
    scraper.get_items_around_average(average, list_of_attributes, num_items_around_average, avg_range)
    print("\n")
    num_items = int(input("How many items would you like to display based off of proximity to zip code?: "))
    print("Items around you: ")
    scraper.get_items_around_zip(list_of_attributes, num_items)
    print("\n")
    print(f"Link of listings: {scraper.url}")

What I want is for the program to ask how many URLs the user wants to scrape. That input would determine the number of instances of this script that need to run.

The user would then run through the URL-building prompts for each scraper (e.g. "What location would you like to search?: "). After they finish creating the URLs, each scraper would run with its specific URL and display back the three outputs described above for that scraper's URL.
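One hedged way to get that behavior inside a single program is the multiprocessing module: gather every parameter set up front (interactive prompts cannot run inside the workers), then give each search its own process with its own Chrome driver. A sketch, assuming the CraigslistScraper class and the prompt_params helper from earlier:

from multiprocessing import Process

def run_scraper(params):
    # Worker: builds one scraper from its parameter tuple and runs the full
    # scrape -> sort -> export -> display flow for that one unique url.
    scraper = CraigslistScraper(*params)
    results = scraper.load_craigslist_url()
    scraper.kill()
    # ... build list_of_attributes, export, and print the three outputs
    # exactly as in the main block above.

if __name__ == "__main__":
    count = int(input("How many URLs would you like to scrape?: "))
    param_sets = [prompt_params() for _ in range(count)]  # all prompting happens up front
    workers = [Process(target=run_scraper, args=(p,)) for p in param_sets]
    for w in workers:
        w.start()   # each scraper gets its own process (and its own browser)
    for w in workers:
        w.join()    # wait until every scraper has finished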

In the future I want to add a timing function where the user determines how often they want the script to run (hourly, daily, every other day, etc.), and to hook up the database so that the "x" listings around the average price and the "x" closest listings for a specific URL's results are simply queried from the database instead.
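For the timing part, a stdlib-only sketch is simply a loop around one scrape with time.sleep; the INTERVALS table and run_scraper are assumptions carried over from the sketch above:

import time

INTERVALS = {"hourly": 3600, "daily": 86400, "every other day": 172800}  # seconds

def run_on_schedule(params, interval_name):
    # Re-runs one scrape at the chosen interval, forever (Ctrl+C to stop).
    seconds = INTERVALS[interval_name]
    while True:
        run_scraper(params)   # one full scrape/export cycle, as sketched above
        time.sleep(seconds)   # sleep until the next scheduled run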

If you want multiple scraper instances running in parallel while your main loop runs, you will need to use subprocesses.

https://docs.python.org/3/library/subprocess.html
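
A minimal sketch of that approach, assuming the scraper code lives in its own script (the name scraper.py and its command-line handling are assumptions): the launcher starts one child Python process per URL and waits for all of them.

# launcher.py -- starts one scraper.py child per url and waits for them all
import subprocess
import sys

# The url list would come from the per-scraper prompts described in the
# question; the entry below is illustrative, built with the same pattern
# as self.url in the class above.
urls = [
    "https://sfbay.craigslist.org/search/sss?&max_price=100&postal=94105&query=bicycles&search_distance=5",
]

processes = [subprocess.Popen([sys.executable, "scraper.py", url]) for url in urls]
for proc in processes:
    proc.wait()  # block until every child scraper has exited

# Inside scraper.py, the child would read its url as: url = sys.argv[1]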
