For 循环花费的时间太长

Question

Edit: Here is the code above the loop for more helpful answers if it wasn't the problem.编辑：这是循环上方的代码，如果不是问题，可以提供更多有用的答案。

import os
import pandas as pd 
import numpy 
import csv 
from math import *
ParcelSize = 50
UARFCN = 3087 

y= r"C:\Users\Heba R\Desktop\GP\Pilot1.csv"
x= r"C:\Users\Heba R\Desktop\GP\Cell.csv"

scanner_File = pd.read_csv(y) 
Cell_file = pd.read_csv(x)
Cells = Cell_file[['Cell', 'Lat', 'Lon', 'SC', 'UARFCN', 'ANT_DIRECTION']]

scanner = scanner_File[
        ['Latitude', 'Longitude', 'PSC: Top #1 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #1 (UARFCN #01)',
         'Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)',
         'PSC: Top #2 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #2 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #2 (UARFCN #01)',
         'PSC: Top #3 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #3 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #3 (UARFCN #01)',
         'PSC: Top #4 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #4 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #4 (UARFCN #01)',
         'PSC: Top #5 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #5 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #5 (UARFCN #01)',
         'PSC: Top #6 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #6 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #6 (UARFCN #01)',
         'PSC: Top #7 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #7 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #7 (UARFCN #01)',
         'PSC: Top #8 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #8 (UARFCN #01)', 'Sc Aggr Ec/Io (dB): Top #8 (UARFCN #01)',
         'PSC: Top #9 (UARFCN #01)', 'Sc Aggr Ec (dBm): Top #9 (UARFCN #01)',
         'Sc Aggr Ec/Io (dB): Top #9 (UARFCN #01)']]
scanner_size = scanner.shape[0] 
cells_size = Cells.shape[0]

def CalcDistanceM(lat1, lon1, lat2, lon2): 
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2]) #convert decimal to rad 
        #haversine formula to calculate two points great circle distance on earth
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = 6371 * c * 1000 #radius of earth in km =6371
        return distance

def fn_CalcParcelID(Pos, ParcelUnitSize):
        if (Pos == 500):  # null parcel
            Result = int(50000000)
        elif (Pos < 0):
            Result = int(Pos * 100000) - ParcelUnitSize + (int(Pos * 100000) % ParcelUnitSize)
        else:
            Result = int(Pos * 100000) - (int(Pos * 100000) % ParcelUnitSize)
        return int(Result)

A1=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A2=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A3=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A4=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A5=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A6=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A7=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A8=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
A9=pd.DataFrame(columns=['Latitude','Longitude','PSC','EcNo','RSCP'])
for i in range (scanner_size):
       #if isnan(scanner['PSC: Top #1 (UARFCN #01)'][i]) == False:
       if (scanner['PSC: Top #1 (UARFCN #01)'][i]) != -1 :
          A1 = A1.append({ 'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                            'PSC': scanner['PSC: Top #1 (UARFCN #01)'][i],'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)'][i],'RSCP': scanner['Sc Aggr Ec (dBm): Top #1 (UARFCN #01)'][i]}, ignore_index=True)

       if (scanner['PSC: Top #2 (UARFCN #01)'][i]) !=-1:
           A2 = A2.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #2 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #2 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #2 (UARFCN #01)'][i]}, ignore_index=True)
       if (scanner['PSC: Top #3 (UARFCN #01)'][i]) != -1:
           A3 = A3.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #3 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #3 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #3 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #4 (UARFCN #01)'][i]) != -1:
           A4 = A4.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #4 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #4 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #4 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #5 (UARFCN #01)'][i]) != -1:
           A5 = A5.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #5 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #5 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #5 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #6 (UARFCN #01)'][i]) != -1:
           A6 = A6.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #6 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #6 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #6 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #7 (UARFCN #01)'][i]) != -1:
           A7 = A7.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #7 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #7 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #7 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #8 (UARFCN #01)'][i]) != -1:
           A8 = A8.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #8 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #8 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #8 (UARFCN #01)'][i]}, ignore_index=True)
       if  (scanner['PSC: Top #9 (UARFCN #01)'][i]) != -1:
           A9 = A9.append({'Latitude': scanner['Latitude'][i], 'Longitude': scanner['Longitude'][i],
                           'PSC': scanner['PSC: Top #9 (UARFCN #01)'][i],
                           'EcNo': scanner['Sc Aggr Ec/Io (dB): Top #9 (UARFCN #01)'][i],
                           'RSCP': scanner['Sc Aggr Ec (dBm): Top #9 (UARFCN #01)'][i]}, ignore_index=True)
A=pd.concat([A1,A2,A3,A4,A5,A6,A7,A8,A9],sort=False)
A = A[~A[['Latitude','Longitude','PSC','EcNo','RSCP']].apply(frozenset, axis=1).duplicated()] #~ is bitwise not frozenset elem remain unchanged after creation
A.to_csv('table_data_pilot.csv',index=True)
A = pd.read_csv('table_data_pilot.csv')
#A=A.iloc[:50,:].reset_index()
A_size = A.shape[0]

for i in range(A_size):
      j = i +1
      for j in range (A_size):
         dLat=A['Latitude'][i] - A['Latitude'][j]
         dLon=A['Longitude'][i] - A['Longitude'][j]
         if abs(dLat) < 0.00045 and abs(dLon) < 0.00045:
           distance = CalcDistanceM(A['Latitude'][j], A['Longitude'][j],
                                                A['Latitude'][i],
                                                A['Longitude'][i])
           print (distance)

B1 = pd.DataFrame(columns=['Lat','Lon','UARFCN','PSC','SC_Avg_EcNo','SC_Avg_RSCP'])

First of all, I just got started in Python and therefore I do not have much knowledge.首先，我刚开始使用 Python，因此我没有太多知识。 I tried to search for similar problems and I could not find a proper solution.I am using the following code:我试图搜索类似的问题，但找不到合适的解决方案。我正在使用以下代码：

for i in range(A_size):
      x1=float(fn_CalcParcelID(A['Latitude'][i], ParcelSize) )/ 100000
      x2=float(fn_CalcParcelID(A['Longitude'][i], ParcelSize) ) / 100000
      B1 = B1.append({'Lat': x1, 'Lon': x2,
                            'PSC': A ['PSC'][i],
                            'UARFCN':UARFCN,
                            'SC_Avg_EcNo':A['EcNo'][i],
                            'SC_Avg_RSCP': A['RSCP'][i]

                            }, ignore_index=True)
B1.to_csv('B1.csv')

The loop is meant to calculate new latitude and longitude and then making a new csv file.该循环旨在计算新的纬度和经度，然后制作一个新的 csv 文件。 A is a csv file that has almost 23000 rows and 42 column A 是一个 csv 文件，它有近 23000 行和 42 列

Answer 1

In general, you should avoid iterating over a Pandas DataFrame using a for loop as much as possible.通常，您应该尽可能避免使用for循环遍历 Pandas DataFrame。

The Pandas documentation on Iteration says:关于迭代的 Pandas 文档说：

Warning警告

Iterating through pandas objects is generally slow .遍历 pandas 对象通常很慢。 In many cases, iterating manually over the rows is not needed and can be avoided with one of the following approaches:在许多情况下，不需要手动迭代行，可以使用以下方法之一来避免：

Look for a vectorized solution: many operations can be performed using built-in methods or NumPy functions, (boolean) indexing, …寻找矢量化解决方案：可以使用内置方法或 NumPy 函数、（布尔值）索引等执行许多操作。

When you have a function that cannot work on the full DataFrame/Series at once, it is better to use apply() instead of iterating over the values.当您的函数无法同时处理完整的 DataFrame/Series 时，最好使用apply()而不是迭代这些值。 See the docs on function application.请参阅函数应用程序的文档。

Furthermore, using append() to add a new row to the DataFrame inside a loop is quite problematic.此外，使用append()在循环内向 DataFrame 添加新行是非常有问题的。

Docs on Concat explain it: Concat 上的文档对此进行了解释：

Adding a column to a DataFrame is relatively fast.向 DataFrame 添加一列相对较快。 However, adding a row requires a copy, and may be expensive.但是，添加一行需要一个副本，并且可能很昂贵。 We recommend passing a pre-built list of records to the DataFrame constructor instead of building a DataFrame by iteratively appending records to it.我们建议将预先构建的记录列表传递给 DataFrame 构造函数，而不是通过迭代地向其添加记录来构建 DataFrame。 See Appending to dataframe for more.有关更多信息，请参阅附加到数据框。

If you do that in a loop, then each iteration of the loop will be copying all data from the DataFrame to a new DataFrame, just to add the one row.如果您在循环中执行此操作，则循环的每次迭代都会将 DataFrame 中的所有数据复制到新的 DataFrame 中，只需添加一行即可。 Also, this operation becomes more expensive every time, since the DataFrame keeps growing and you'll have more data to copy every time.此外，此操作每次都会变得更加昂贵，因为 DataFrame 不断增长并且您每次都会有更多数据要复制。

In your specific case, you can easily avoid most of that, by processing A as a whole , generating all the rows you want to append to B1, then doing a single append() operation, meaning you only need to copy B1 once.在您的特定情况下，您可以轻松避免大部分情况，方法是将 A作为一个整体处理，生成要附加到 B1 的所有行，然后执行单个append()操作，这意味着您只需要复制 B1 一次。

Putting it all together:把它们放在一起：

rows_to_add = pd.DataFrame({
    'Lat': A['Latitude'].apply(
        lambda x: fn_CalcParcelID(x, ParcelSize) / 100000.0
    ),
    'Lon': A['Longitude'].apply(
        lambda x: fn_CalcParcelID(x, ParcelSize) / 100000.0
    ),
    'PSC': A['PSC'],
    'UARFCN': UARFCN,
    'SC_Avg_EcNo': A['EcNo'],
    'SC_Avg_RSCP': A['RSCP'],
})
B1 = B1.append(rows_to_add, ignore_index=True)

This should get you from running minutes without seeing an end to having this operation complete in seconds.这应该可以让您从运行几分钟而看不到结束在几秒钟内完成此操作。

You can optimize it further by implementing fn_CalcParcelID() using vectorized operations.您可以通过使用矢量化操作实现fn_CalcParcelID()来进一步优化它。 (Hard to tell how to do that, since you don't show us the implementation of that function.) But the first optimization might be all you need. （很难说如何做到这一点，因为您没有向我们展示该函数的实现。）但是第一个优化可能就是您所需要的。 Ask a new question about vectorizing fn_CalcParcelID() if you feel that might be worth it.如果您觉得值得，请提出一个关于向量化fn_CalcParcelID()的新问题。

UPDATE: You do have a version of the same problem with the first part of your code, where you're looping through the scanner CSV file and reorganizing it into A1 through A9.更新：您的代码的第一部分确实存在相同问题的版本，您在其中循环scanner CSV 文件并将其重新组织为 A1 到 A9。 (You have an A1 = A1.append(...) inside each of those for loops, so you do have appends in a loop there as well!) （您在每个for循环中都有一个A1 = A1.append(...) ，因此您在循环中也有追加内容！）

You can address that with:您可以通过以下方式解决该问题：

A1_rows = scanner[scanner['PSC: Top #1 (UARFCN #01)'] != -1]
A1 = pd.DataFrame({
    'Latitude': A1_rows['Latitude'],
    'Longitude': A1_rows['Longitude'],
    'PSC': A1_rows['PSC: Top #1 (UARFCN #01)'],
    'EcNo': A1_rows['Sc Aggr Ec/Io (dB): Top #1 (UARFCN #01)'],
    'RSCP': A1_rows['Sc Aggr Ec (dBm): Top #1 (UARFCN #01)'],
})

And similar for the other 8 similar DataFrames.其他 8 个类似的 DataFrame 也类似。

Answer 2

如果您使用的是熊猫，则使用 iterrows()。

   for idx,row in df.iterrows():

For 循环花费的时间太长

问题描述

2 个解决方案

解决方案1
2 已采纳 2020-02-09 17:47:35

解决方案2
0 2020-02-09 11:42:30

For 循环花费的时间太长

问题描述

2 个解决方案

解决方案1 2 已采纳 2020-02-09 17:47:35

解决方案2 0 2020-02-09 11:42:30

解决方案1
2 已采纳 2020-02-09 17:47:35

解决方案2
0 2020-02-09 11:42:30