I want to parse through a contacts list CSV file that looks like this:
First Name Last Name Full Name Short Name Phone Number
Jenny Smith CN=Jenny Smith/OU=CORP/O=COMPANY jesmi 6468675309
Mary Poppin CN=Mary Poppins/OU=STORE/O=COMPANY mapop 7005555578
Tony Stark CN=Tony Stark/OU=STORE/O=COMPANY tostar 6007777798
Peter Parker CN=Peter Parker/OU=NEWS/O=COMPANY pepar 5008889090
I want to search the "Full Name" column for the string "OU=STORE", set aside every row that contains it, and write those rows to their own CSV file called "store.csv". Then I want to repeat the same process for "OU=CORP" and "OU=NEWS".
This is what I want my output to look like:
Store.csv should contain only this information once the process is done.
First Name Last Name Full Name Short Name Phone Number
Mary Poppin CN=Mary Poppins/OU=STORE/O=COMPANY mapop 7005555578
Tony Stark CN=Tony Stark/OU=STORE/O=COMPANY tostar 6007777798
corp.csv
First Name Last Name Full Name Short Name Phone Number
Jenny Smith CN=Jenny Smith/OU=CORP/O=COMPANY jesmi 6468675309
news.csv
First Name Last Name Full Name Short Name Phone Number
Peter Parker CN=Peter Parker/OU=NEWS/O=COMPANY pepar 5008889090
I have a small script of what I've done so far but I'm not sure what to do in the end:
import pandas as pd
import csv

# Path of the source CSV file that holds every contact.
source_dir = 'C:/Users/username/documents/contacts/contactslist.csv'

# Folders that will receive the parsed data, one per organisational unit.
store_target_dir = 'C:/Users/username/documents/contacts/store/'
corp_target_dir = 'C:/Users/username/documents/contacts/corp/'
news_target_dir = 'C:/Users/username/documents/contacts/news/'

col_list = ["Full Name"]

# Markers to look for in the "Full Name" column.
store = 'OU=STORE'
corp = 'OU=CORP'
news = 'OU=NEWS'

# Full path of each output file. The target folders already end with "/",
# so the file name is appended directly instead of adding a second separator.
csvName = store_target_dir + "store.csv"
csvName2 = corp_target_dir + "corp.csv"
csvName3 = news_target_dir + "news.csv"

# Read the whole contacts file into a DataFrame; the with-block closes the
# file handle as soon as the data has been loaded.
with open(source_dir) as file:
    df = pd.read_csv(file)
To filter your DataFrame, you could do something like this:
# key is the value you are looking for, e.g. 'OU=STORE'.
# The column that holds it is "Full Name" (there is no "File Name" column).
indices = [key in value for value in df['Full Name']]
# Boolean-mask indexing keeps only the rows whose flag is True.
subset = df[indices]
`indices` is a list of booleans indicating, for each row, whether that row contains `key` or not.
You could extract the `OU=` value and add it as another column. `.unique()` could then be used to determine the 3 possible values, and each CSV file could then be created based on that value. For example:
import pandas as pd

# Read phone numbers as text so they are written back out exactly as read.
df = pd.read_csv('contactslist.csv', dtype={'Phone Number': str})

# Extract the organisational unit from "CN=.../OU=<unit>/O=...".
# "[^/]+" stops at the next "/" (or at the end of the string), so only the
# unit itself is captured. Lower-casing gives file names such as "store.csv".
df['file'] = df['Full Name'].str.extract(r'OU=([^/]+)', expand=False).str.lower()

# Rows without an "OU=" component yield NaN; dropna() keeps them from
# producing a header-only "nan.csv".
for key in df['file'].dropna().unique():
    df_filtered = df.loc[df['file'] == key]
    # The helper column is not part of the requested output.
    df_filtered = df_filtered.drop(columns='file')
    df_filtered.to_csv(f"{key}.csv", index=False)
Trying not to modify your code too much, a solution could look like this:
import os
import os.path
import csv

# Source CSV file holding every contact.
original_contacts_filename = r'C:\Users\username\documents\contacts\contactslist.csv'

# Destination folders for the parsed data, one per organisational unit.
store_target_dir = r'C:\Users\username\documents\contacts\store'
corp_target_dir = r'C:\Users\username\documents\contacts\corp'
news_target_dir = r'C:\Users\username\documents\contacts\news'
for target_dir in (store_target_dir, corp_target_dir, news_target_dir):
    os.makedirs(target_dir, exist_ok=True)

# Markers searched for in the "Full Name" column.
store = 'OU=STORE'
corp = 'OU=CORP'
news = 'OU=NEWS'

# Full path of each output file.
csv_name = os.path.join(store_target_dir, "store.csv")
csv_name2 = os.path.join(corp_target_dir, "corp.csv")
csv_name3 = os.path.join(news_target_dir, "news.csv")

output_headers = ('First Name', 'Last Name', 'Full Name', 'Short Name', 'Phone Number')

with (
    open(original_contacts_filename, newline='') as contacts_file,
    open(csv_name, mode='w', newline='') as store_file,
    open(csv_name2, mode='w', newline='') as corp_file,
    open(csv_name3, mode='w', newline='') as news_file,
):
    # (marker, writer) pairs, checked in this order: the first marker found
    # in a contact's full name decides where the row goes.
    routes = (
        (store, csv.writer(store_file)),
        (corp, csv.writer(corp_file)),
        (news, csv.writer(news_file)),
    )

    # Every output file starts with the same header row.
    for _, route_writer in routes:
        route_writer.writerow(output_headers)

    for contact in csv.DictReader(contacts_file):
        full_name = contact['Full Name']
        destination = next(
            (route_writer for marker, route_writer in routes if marker in full_name),
            None,
        )
        # Contacts that match none of the markers are left out of every file.
        if destination is None:
            continue
        destination.writerow([contact[column] for column in output_headers])
But we can see a lot of repetition, which is usually a code smell. We could simplify the code like this:
import contextlib
import csv
import os
import os.path
import re

original_contacts_filename = 'contactslist.csv'
source_directory = r'C:\Users\username\documents\contacts'
corporate_units_expected = ('store', 'corp', 'news')
target_directory = r'C:\Users\username\documents\contacts'
output_headers = ('First Name', 'Last Name', 'Full Name', 'Short Name', 'Phone Number')

# The unit name is the run of letters, digits and spaces that follows "OU=".
matcher = re.compile(r'OU=([ 0-9a-zA-Z]+)')

# ExitStack closes every file registered with it when the block ends, so the
# set of output files is driven by corporate_units_expected alone.
with contextlib.ExitStack() as open_files:
    original_contacts_file = open_files.enter_context(
        open(os.path.join(source_directory, original_contacts_filename), newline='')
    )
    original_contacts = csv.DictReader(original_contacts_file)

    writers = {}
    for current_unit in corporate_units_expected:
        current_directory = os.path.join(target_directory, current_unit)
        # The directory must exist before the file inside it can be opened.
        os.makedirs(current_directory, exist_ok=True)
        current_file = open_files.enter_context(
            open(os.path.join(current_directory, f'{current_unit}.csv'), 'w', newline='')
        )
        writers[current_unit] = csv.writer(current_file)
        # Every output file starts with the same header row.
        writers[current_unit].writerow(output_headers)

    for current_contact in original_contacts:
        match_found = matcher.search(current_contact['Full Name'])
        if not match_found:
            continue
        # Units that are not in corporate_units_expected have no writer;
        # their rows are skipped instead of raising KeyError.
        current_writer = writers.get(match_found[1].lower())
        if current_writer is None:
            continue
        current_writer.writerow([current_contact[column] for column in output_headers])
We could also have an example where we don't know in advance how many files we wish to sort these entries into, but it gets a lot more complicated because we cannot use the `with` statement directly.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.