简体   繁体   中英

How to compare two strings in a large pandas DataFrame (Python 3.x)?

I have two DataFrames loaded from two Excel files.

1st file(awcProjectMaster)(1500 records)

projectCode    projectName
  100101       kupwara
  100102       kalaroos
  100103       tangdar

2nd file(village master)(more than 10 million records)

villageCode    villageName
   425638          wara
   783651          tangdur
   986321          kalaroo

I need to compare each projectName with each villageName and record the percentage match. The following code works, but it is slow. How can I do the same thing more efficiently?

import pandas as pd
from datetime import datetime

# Workbook locations: the project master and the village master.
project_master_xlsx = "C:\\Users\\Desktop\\awcProjectMaster.xlsx"
village_master_xlsx = "C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx"

df = pd.read_excel(project_master_xlsx)  # one row per project
df1 = pd.read_excel(village_master_xlsx)  # one row per village


def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    """Append one CSV row to the results file when one name contains the other.

    The shorter of ``prjName`` / ``vName`` is searched for inside the longer
    one.  On a hit the match percentage is ``len(shorter) / len(longer) * 100``
    rounded to the nearest integer, and a row with these comma-separated
    columns is appended to the output file:

        project code, project name, village code, village name, percentage,
        state code, state name, district code, district name,
        sub-district code, sub-district name

    Nothing is written when neither name contains the other, or when both
    names are empty.

    All arguments must be ``str`` (they are joined as text); a non-string
    argument raises ``TypeError``.  Returns ``None``.
    """
    # sorted() is stable, so equal-length names keep (prjName, vName) order;
    # containment between equal-length strings is symmetric anyway.
    shorter, longer = sorted((prjName, vName), key=len)

    # An empty "longer" means both names are empty: that would divide by zero
    # below and there is nothing meaningful to report.
    if not longer or shorter not in longer:
        return

    percentMatch = (len(shorter) / len(longer)) * 100
    fields = [
        prjCode,
        prjName,
        vCode,
        vName,
        str(round(percentMatch)),
        stCode,
        stName,
        dCode,
        dName,  # kept as its own column; it used to be fused with sdCode
        sdCode,
        sdName,
    ]

    # Open the file only when there is a row to write: this function runs once
    # per (project, village) pair and most pairs do not match.
    with open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a") as f:
        f.write(",".join(fields) + "\n")


# Compare every project row against every village row (full cross product).
for idx, row in df.iterrows():
    # These two values depend only on the outer row, so compute them once per
    # project instead of once per (project, village) pair.
    project_code = str(row["ProjectCode"])
    project_name = row["ProjectName"].lower()
    for idxv, r in df1.iterrows():
        compare(
            project_code,
            project_name,
            str(r["StateCensusCode"]),
            r["StateName"],
            str(r["DistrictCode"]),
            r["DistrictName"],
            str(r["SubDistrictCode"]),
            r["SubDistrictNameInEnglish"],
            str(r["VillageCode"]),
            r["VillageNameInEnglish"].lower(),
        )

Your distance metric for the strings isn't very accurate, but if it works for you, fine. (You may want to look into other options, though, such as the built-in difflib module or the Python-Levenshtein package.)

If you really do need to compare 1,500 x 10,000,000 records pairwise, things are bound to take some time, but there are a couple of things we can do pretty easily to speed things up:

  • open the log file only once; reopening it for every comparison has overhead, sometimes significant
  • refactor your comparison function into a separate unit, then apply the lru_cache() memoization decorator so that a pair that is still in the in-memory cache is not compared again. (In addition, see how we sort the vName / prjName pair: since the order of the two strings doesn't matter to the comparison, we end up with half as many cache keys.)

Then for general cleanliness,

  • use the csv module for streaming CSV into a file (the output format is slightly different from what your code produces, but you can change this with the dialect parameter to csv.writer()).

Hope this helps!

import pandas as pd
from datetime import datetime
from functools import lru_cache
import csv

# Input workbooks: the project master and the village master.
df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")

# Open the output once for the whole run instead of once per comparison.
# newline="" is what the csv module requires of the file object it writes to;
# without it every row ends in "\r\r\n" on Windows and reads back as a row
# followed by a blank line.
log_file = open(
    r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt",
    "a",
    newline="",
)
log_writer = csv.writer(log_file)


@lru_cache()
def compare_vname_prjname(vName, prjName):
    """Return the containment-based match percentage of two names, or ``None``.

    The shorter string is searched for inside the longer one.  If it is found,
    the result is ``len(shorter) / len(longer) * 100`` as a float; otherwise
    ``None``.  The result does not depend on argument order.

    Two empty strings return ``None`` rather than dividing by zero.  An empty
    string paired with a non-empty one still counts as contained and scores
    ``0.0``.

    Results are memoized by ``lru_cache`` with its default size, so both
    arguments must be hashable (plain ``str`` is expected).
    """
    # sorted() is stable; for equal lengths the order is irrelevant because
    # equal-length strings contain each other only when they are identical.
    shorter, longer = sorted((prjName, vName), key=len)
    if not longer:  # both names empty: no meaningful ratio
        return None
    if shorter in longer:
        return (len(shorter) / len(longer)) * 100
    return None


def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    """Score one project/village pair and log it as a CSV row if it matches.

    The match percentage comes from ``compare_vname_prjname``.  When that
    returns ``None`` nothing is written and ``False`` is returned.  Otherwise
    one row is written through the module-level ``log_writer`` with the
    columns below, and ``True`` is returned:

        project code, project name, village code, village name,
        rounded percentage, state code, state name, district code,
        district name, sub-district code, sub-district name
    """
    # The score is symmetric, so always pass the two names in sorted order:
    # (a, b) and (b, a) then share a single cache entry.  Separate locals are
    # used on purpose: rebinding prjName / vName here would swap the
    # project-name and village-name columns in the row written below.
    first, second = sorted((vName, prjName))
    percent_match = compare_vname_prjname(first, second)
    if percent_match is None:  # No match
        return False
    log_writer.writerow(
        [
            prjCode,
            prjName,
            vCode,
            vName,
            round(percent_match),
            stCode,
            stName,
            dCode,
            dName,  # own column; previously concatenated with sdCode
            sdCode,
            sdName,
        ]
    )
    return True


# Compare every project row against every village row (full cross product).
try:
    for idx, row in df.iterrows():
        # These two values depend only on the outer row, so compute them once
        # per project instead of once per (project, village) pair.
        project_code = str(row["ProjectCode"])
        project_name = row["ProjectName"].lower()
        for idxv, r in df1.iterrows():
            compare(
                project_code,
                project_name,
                str(r["StateCensusCode"]),
                r["StateName"],
                str(r["DistrictCode"]),
                r["DistrictName"],
                str(r["SubDistrictCode"]),
                r["SubDistrictNameInEnglish"],
                str(r["VillageCode"]),
                r["VillageNameInEnglish"].lower(),
            )
finally:
    # Flush buffered rows and release the output file, even if a row raises
    # part-way through the run.
    log_file.close()

The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM