For loop is taking too long

Question

Edit: Here is the code above the loop for more helpful answers if it wasn't the problem.

import os
import pandas as pd 
import numpy 
import csv 
from math import *
ParcelSize = 50
UARFCN = 3087 

y= r"C:\Users\Heba R\Desktop\GP\Pilot1.csv"
x= r"C:\Users\Heba R\Desktop\GP\Cell.csv"

scanner_File = pd.read_csv(y) 
Cell_file = pd.read_csv(x)
Cells = Cell_file[['Cell', 'Lat', 'Lon', 'SC', 'UARFCN', 'ANT_DIRECTION']]

scanner = scanner_File[
        ['Latitude', 'Longitude', 'PSC: Top #1 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #1 (UARFCN #01)',
         'Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)',
         'PSC: Top #2 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #2 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #2 (UARFCN #01)',
         'PSC: Top #3 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #3 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #3 (UARFCN #01)',
         'PSC: Top #4 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #4 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #4 (UARFCN #01)',
         'PSC: Top #5 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #5 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #5 (UARFCN #01)',
         'PSC: Top #6 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #6 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #6 (UARFCN #01)',
         'PSC: Top #7 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #7 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #7 (UARFCN #01)',
         'PSC: Top #8 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #8 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #8 (UARFCN #01)',
         'PSC: Top #9 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #9 (UARFCN #01)',
         'Sc Aggr Ec/Io (dB): Top #9 (UARFCN #01)']]
scanner_size = scanner.shape[0] 
cells_size = Cells.shape[0]

def CalcDistanceM(lat1, lon1, lat2, lon2): 
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2]) #convert decimal to rad 
        #haversine formula to calculate two points great circle distance on earth
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = 6371 * c * 1000 #radius of earth in km =6371
        return distance

def fn_CalcParcelID(Pos, ParcelUnitSize):
        if (Pos == 500):  # null parcel
            Result = int(50000000)
        elif (Pos < 0):
            Result = int(Pos * 100000) - ParcelUnitSize + (int(Pos * 100000) % ParcelUnitSize)
        else:
            Result = int(Pos * 100000) - (int(Pos * 100000) % ParcelUnitSize)
        return int(Result)

A1=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A2=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A3=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A4=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A5=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A6=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A7=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A8=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A9=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
for i in range (scanner_size):
       #if isnan(scanner['PSC: Top #1 (UARFCN #01)'][i]) == False:
       if (scanner['PSC: Top #1 (UARFCN #01)'][i]) != -1 :
          A1 = A1.append({ 'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                            'PSC': scanner['PSC: Top #1 (UARFCN #01)'][i],'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)'][i],'RSCP': scanner['Sc Aggr Ec (dBm): Top #1 (UARFCN #01)'][i]}, ignore_index=True)

       if (scanner['PSC: Top #2 (UARFCN #01)'][i]) !=-1:
           A2 = A2.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #2 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #2 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #2 (UARFCN #01)'][i]}, ignore_index=True)
       if (scanner['PSC: Top #3 (UARFCN #01)'][i]) != -1:
           A3 = A3.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #3 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #3 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #3 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #4 (UARFCN #01)'][i]) != -1:
           A4 = A4.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #4 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #4 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #4 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #5 (UARFCN #01)'][i]) != -1:
           A5 = A5.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #5 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #5 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #5 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #6 (UARFCN #01)'][i]) != -1:
           A6 = A6.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #6 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #6 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #6 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #7 (UARFCN #01)'][i]) != -1:
           A7 = A7.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #7 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #7 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #7 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #8 (UARFCN #01)'][i]) != -1:
           A8 = A8.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #8 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #8 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #8 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #9 (UARFCN #01)'][i]) != -1:
           A9 = A9.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #9 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #9 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #9 (UARFCN #01)'][i]}, ignore_index=True)
A=pd.concat([A1,A2,A3,A4,A5,A6,A7,A8,A9],sort=False)
A = A[~A[['Latitude','Longitude','PSC','EcNo','RSCP']].apply(frozenset, axis=1).duplicated()] #~ is bitwise not frozenset elem remain unchanged after creation
A.to_csv('table_data_pilot.csv',index=True)
A = pd.read_csv('table_data_pilot.csv')
#A=A.iloc[:50,:].reset_index()
A_size = A.shape[0]

for i in range(A_size):
      j = i +1
      for j in range (A_size):
         dLat=A['Latitude'][i] - A['Latitude'][j]
         dLon=A['Longitude'][i] - A['Longitude'][j]
         if abs(dLat) < 0.00045 and abs(dLon) < 0.00045:
           distance = CalcDistanceM(A['Latitude'][j], A['Longitude'][j],
                                                A['Latitude'][i],
                                                A['Longitude'][i])
           print (distance)

B1 = pd.DataFrame(columns=['Lat','Lon','UARFCN','PSC','SC_Avg_EcNo','SC_Avg_RSCP'])

First of all, I just got started in Python and therefore I do not have much knowledge. I tried to search for similar problems and I could not find a proper solution.I am using the following code:

for i in range(A_size):
      x1=float(fn_CalcParcelID(A['Latitude'][i], ParcelSize) )/ 100000
      x2=float(fn_CalcParcelID(A['Longitude'][i], ParcelSize) ) / 100000
      B1 = B1.append({'Lat': x1, 'Lon': x2,
                            'PSC': A ['PSC'][i],
                            'UARFCN':UARFCN,
                            'SC_Avg_EcNo':A['EcNo'][i],
                            'SC_Avg_RSCP': A['RSCP'][i]

                            }, ignore_index=True)
B1.to_csv('B1.csv')

The loop is meant to calculate new latitude and longitude and then making a new csv file. A is a csv file that has almost 23000 rows and 42 column

Answer 1

In general, you should avoid iterating over a Pandas DataFrame using a for loop as much as possible.

The Pandas documentation on Iteration says:

Warning

Iterating through pandas objects is generally slow . In many cases, iterating manually over the rows is not needed and can be avoided with one of the following approaches:

Look for a vectorized solution: many operations can be performed using built-in methods or NumPy functions, (boolean) indexing, …

When you have a function that cannot work on the full DataFrame/Series at once, it is better to use apply() instead of iterating over the values. See the docs on function application.

Furthermore, using append() to add a new row to the DataFrame inside a loop is quite problematic.

Docs on Concat explain it:

Adding a column to a DataFrame is relatively fast. However, adding a row requires a copy, and may be expensive. We recommend passing a pre-built list of records to the DataFrame constructor instead of building a DataFrame by iteratively appending records to it. See Appending to dataframe for more.

If you do that in a loop, then each iteration of the loop will be copying all data from the DataFrame to a new DataFrame, just to add the one row. Also, this operation becomes more expensive every time, since the DataFrame keeps growing and you'll have more data to copy every time.

In your specific case, you can easily avoid most of that, by processing A as a whole , generating all the rows you want to append to B1, then doing a single append() operation, meaning you only need to copy B1 once.

Putting it all together:

rows_to_add = pd.DataFrame({
    'Lat': A['Latitude'].apply(
        lambda x: fn_CalcParcelID(x, ParcelSize) / 100000.0
    ),
    'Lon': A['Longitude'].apply(
        lambda x: fn_CalcParcelID(x, ParcelSize) / 100000.0
    ),
    'PSC': A['PSC'],
    'UARFCN': UARFCN,
    'SC_Avg_EcNo': A['EcNo'],
    'SC_Avg_RSCP': A['RSCP'],
})
B1 = B1.append(rows_to_add, ignore_index=True)

This should get you from running minutes without seeing an end to having this operation complete in seconds.

You can optimize it further by implementing fn_CalcParcelID() using vectorized operations. (Hard to tell how to do that, since you don't show us the implementation of that function.) But the first optimization might be all you need. Ask a new question about vectorizing fn_CalcParcelID() if you feel that might be worth it.

UPDATE: You do have a version of the same problem with the first part of your code, where you're looping through the scanner CSV file and reorganizing it into A1 through A9. (You have an A1 = A1.append(...) inside each of those for loops, so you do have appends in a loop there as well!)

You can address that with:

A1_rows = scanner[scanner['PSC: Top #1 (UARFCN #01)'] != -1]
A1 = pd.DataFrame({
    'Latitude': A1_rows['Latitude'],
    'Longitude': A1_rows['Longitude'],
    'PSC': A1_rows['PSC: Top #1 (UARFCN #01)'],
    'EcNo': A1_rows['Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)'],
    'RSCP': A1_rows['Sc Aggr Ec (dBm): Top #1 (UARFCN #01)'],
})

And similar for the other 8 similar DataFrames.

Answer 2

如果您使用的是熊猫，则使用 iterrows()。

   for idx,row in df.iterrows():

For loop is taking too long

Question

2 answers

solution1
2 ACCPTED 2020-02-09 17:47:35

solution2
0 2020-02-09 11:42:30

For loop is taking too long

Question

2 answers

solution1 2 ACCPTED 2020-02-09 17:47:35

solution2 0 2020-02-09 11:42:30

solution1
2 ACCPTED 2020-02-09 17:47:35

solution2
0 2020-02-09 11:42:30