简体   繁体   中英

iterate over a pandas dataframe column to compare results

I have a pandas dataframe with a column of local neighborhoods. What I would like to do is go through this column and compare each neighborhood to each other, in the hopes of serializing the data. When I use a small subset of the data inside my python shell it works fine:

n = pd.DataFrame({'neighborhood':['Dupont Circle', 'Adams Morgan', 'alexandria', 'west end/dupont circle', 'logan circle', 'alexandria, va', 'washington', 'adam morgan/kalorama', 'Washington DC', 'Kalorama']})
print(n)
#results
#            neighborhood
#0           Dupont Circle
#1            Adams Morgan
#2              alexandria
#3  west end/dupont circle
#4            logan circle
#5          alexandria, va
#6              washington
#7    adam morgan/kalorama
#8           Washington DC
#9                Kalorama
for i in range(len(n['neighborhood'])):
    for j in range(i + 1, len(n['neighborhood'])):
        ratio = fw.partial_ratio(n['neighborhood'][i].lower(),n['neighborhood'][j].lower())
        print(n['neighborhood'][i]+' : '+n['neighborhood'][j]+' - '+str(ratio))
        if ratio>90:
            n['neighborhood'][j] = n['neighborhood'][i]
        print(n['neighborhood'][i]+' : '+n['neighborhood'][j])
print(n)
#results
#   neighborhood
#0  Dupont Circle
#1   Adams Morgan
#2     alexandria
#3  Dupont Circle
#4   logan circle
#5     alexandria
#6     washington
#7   Adams Morgan
#8     washington
#9       Kalorama

This is what I expected to happen. However, when I enlarge the scope in terms of running it against data that I scraped from craigslist I get a key error.

#this is from my main data source
neighborhood_results = post_results[['neighborhood']].copy()
neighborhood_results.to_csv('neighborhood_clean.csv',index=False)

for i in range(len(neighborhood_results['neighborhood'])):
    for j in range(i + 1, len(neighborhood_results['neighborhood'])):
            print(i)
            print(j)
            ratio = fw.partial_ratio(neighborhood_results['neighborhood'][i],neighborhood_results['neighborhood'][j])
            if ratio>90:
                neighborhood_results['neighborhood'][j] = neighborhood_results['neighborhood'][i]

When I run this code, the print(I) print(j) they return 0 and 1 as expected, but then I get my key error.

 ratio = fw.partial_ratio(neighborhood_results['neighborhood'][i],neighborhood_results['neighborhood'][j])

line 871, in getitem

result = self.index.get_value(self, key)

File "C:\Users\cards\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) File "pandas_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value File "pandas_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value File "pandas_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc File "pandas_libs\hashtable_class_helper.pxi", line 998, in pandas._libs.hashtable.Int64HashTable.get_item File "pandas_libs\hashtable_class_helper.pxi", line 1005, in pandas._libs.hashtable.Int64HashTable.get_item

KeyError: 0

My understanding is that this has to do with the lookup of the columns and the key. However, why does it work for a smaller data set, but not a larger?

full scraping code:

from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv
from fuzzywuzzy import fuzz as fw

print('hello world')
#get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span',class_='totalcount').text)
pages = np.arange(0,total+1,120)

neighborhood = []
bedroom_count =[]
sqft = []
price = []
link = []
count = 0
for page in pages:
    
    response = get('https://washingtondc.craigslist.org/search/hhh?s='+str(page)+'query=rent&availabilityMode=0&sale_date=all+dates')
    html_result = BeautifulSoup(response.text, 'html.parser')

    posts = html_result.find_all('li', class_='result-row')
    for post in posts:
        if post.find('span',class_='result-hood') is not None:
            post_url = post.find('a',class_='result-title hdrlnk')
            post_link = post_url['href']
            link.append(post_link)
            post_neighborhood = post.find('span',class_='result-hood').text
            post_price = int(post.find('span',class_='result-price').text.strip().replace('$',''))
            neighborhood.append(post_neighborhood)
            price.append(post_price)
            if post.find('span',class_='housing') is not None:
                if 'ft2' in post.find('span',class_='housing').text.split()[0]:
                    post_bedroom = np.nan
                    post_footage = post.find('span',class_='housing').text.split()[0][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())>2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = post.find('span',class_='housing').text.split()[2][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())==2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = np.nan
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
            else:
                post_bedroom = np.nan
                post_footage = np.nan
                bedroom_count.append(post_bedroom)
                sqft.append(post_footage)
        count+=1
       
print(count)
#create results data frame
post_results = pd.DataFrame({'neighborhood':neighborhood,'footage':sqft,'bedroom':bedroom_count,'price':price,'link':link})
#clean up results
post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0,np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0,np.nan)
post_results['neighborhood'] = post_results['neighborhood'].str.strip().str.strip('(|)')
post_results['neighborhood'] = post_results['neighborhood'].str.lower()
post_results = post_results.dropna(subset=['footage','bedroom'],how='all')
post_results.to_csv("rent_clean.csv",index=False)

neighborhood_results = post_results[['neighborhood']].copy()
neighborhood_results.to_csv('neighborhood_clean.csv',index=False)

for i in range(len(neighborhood_results['neighborhood'])):
    for j in range(i + 1, len(neighborhood_results['neighborhood'])):
            print(i)
            print(j)
            ratio = fw.partial_ratio(neighborhood_results['neighborhood'][i],neighborhood_results['neighborhood'][j])
            if ratio>90:
                neighborhood_results['neighborhood'][j] = neighborhood_results['neighborhood'][i]

neighborhood_results.to_csv('neighborhood_clean_a.csv',index=False)

Let pandas do the job for you. It provides very simple functions for iterating over rows and columns:

It's very easy to lose track of how indexes are functioning in your code and by using iterators, you know you're accessing all possible items.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM