[英]Convert SAV into CSV
I tried to convert a SAV file into CSV using the following code:
我尝试使用以下代码将 SAV 文件转换为 CSV:
# NOTE(review): read_stata is the Stata reader, but the path below is an
# SPSS .sav file -- presumably the cause of the "Version of given Stata file"
# ValueError quoted after this snippet.
data = pd.io.stata.read_stata("C:/Users/Nicola/Desktop/Relevant projects activities ACF/BRACED Final Evaluation/Evaluations/CSI_compil_2017.sav")
# Export the same frame twice: to sheet 'data' of an Excel workbook and to CSV.
writer = pd.ExcelWriter('C:/Users/Nicola/Desktop/Baseline.xlsx')
data.to_excel(writer, 'data')
data.to_csv('changed_to_csv.csv')
writer.save()
The output I am getting is the following:
我得到的输出如下:
ValueError: Version of given Stata file is not 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)
ValueError: 给定 Stata 文件的版本不是 104、105、108、111 (Stata 7SE)、113 (Stata 8/9)、114 (Stata 10/11)、115 (Stata 12)、117 (Stata 13) 或118 (Stata 14)
Is there a better snippet I could use to perform this conversion more efficiently? Thanks.
有没有更好的代码片段可以更高效地执行此转换?谢谢。
Please see the following answer: https://stackoverflow.com/a/20873154/5999386
请参阅以下答案:https://stackoverflow.com/a/20873154/5999386
In short, use `import pandas.rpy.common as com` to use R's capabilities to parse the .sav file into a pandas data frame.
简而言之,使用 `import pandas.rpy.common as com`,借助 R 的功能将 .sav 文件解析为 pandas 的数据框。
I just managed to convert it with this snippet (R kernel):
我刚刚用下面这个代码片段(R 内核)成功完成了转换:
# Load the SPSS file with the 'foreign' package, then write it out as
# unquoted, comma-separated text.
library(foreign)
sav_path <- "C:/Users/Nicola/Desktop/Relevant projects activities ACF/BRACED Final Evaluation/Evaluations/CSI_compil_2017.sav"
sav_data <- read.spss(sav_path)
write.table(sav_data, file = "from_sav_data.csv", quote = FALSE, sep = ",")
import pandas as pd
import pyreadstat as py

# read_sav returns the data together with a metadata object.
df, meta = py.read_sav('file.SAV')

# Use the writer as a context manager: the workbook is saved and closed on
# exit, even if to_excel raises. The explicit ExcelWriter.save() call is no
# longer available in pandas 2.x.
with pd.ExcelWriter("file2.xlsx") as writer:
    df.to_excel(writer, sheet_name='df')

df.to_csv('file2.csv')
Here is my class that finds all .sav files in subfolders and converts them to the .csv format.
这是我编写的类,它在子文件夹中查找所有 .sav 文件并将它们转换为 .csv 格式。
The class successfully coped with files up to 2 GB in size.
该类成功处理了最大 2 GB 的文件。
If you only need to convert one file, just use the `convert_sav_to_csv(file_path, dir_to_save=None)` function.
如果您只需要转换一个文件,只需使用 `convert_sav_to_csv(file_path, dir_to_save=None)` 函数。
I used this answer a lot.
我经常参考这个答案。
import glob
import pandas as pd
import os
import enum
import errno
import pyreadstat
# Enum for size units
class SizeUnit(enum.Enum):
    """Units in which a file size can be expressed."""

    BYTES = 1
    KB = 2
    MB = 3
    GB = 4
class ConverterSavCsv:
    """Locate SPSS ``.sav`` files and convert them to ``.csv``.

    Every method is a ``@staticmethod``; the class only serves as a
    namespace. Conversion reads each file in row chunks through
    ``pyreadstat`` and appends the chunks to the output CSV.
    """

    @staticmethod
    def convert_unit(size_in_bytes, unit):
        """Convert a size in bytes to ``unit``.

        KB, MB and GB divide by the matching power of 1024; any other
        value of ``unit`` returns ``size_in_bytes`` unchanged.
        """
        if unit == SizeUnit.KB:
            return size_in_bytes / 1024
        if unit == SizeUnit.MB:
            return size_in_bytes / (1024 * 1024)
        if unit == SizeUnit.GB:
            return size_in_bytes / (1024 * 1024 * 1024)
        return size_in_bytes

    @staticmethod
    def get_file_size(file_name, size_type=SizeUnit.BYTES):
        """Return the size of ``file_name`` expressed in ``size_type`` units."""
        return ConverterSavCsv.convert_unit(os.path.getsize(file_name), size_type)

    @staticmethod
    def find_all_sav_files(upper_dir_path):
        """Recursively list the ``.sav`` files below ``upper_dir_path``.

        Returns:
            A DataFrame with columns ``Path`` and ``Size`` (size in MB),
            sorted by size in descending order. The same table is also
            written to ``spss_files_sizes.xlsx`` inside ``upper_dir_path``.

        Raises:
            NotADirectoryError: if ``upper_dir_path`` is not a directory.
        """
        # Validate the argument itself, not a module-level name that only
        # exists when the file is run as a script.
        if not os.path.isdir(upper_dir_path):
            raise NotADirectoryError(upper_dir_path)

        pattern = os.path.join(upper_dir_path, '**', '*.sav')
        files_and_sizes = [
            (path, ConverterSavCsv.get_file_size(path, SizeUnit.MB))
            for path in glob.glob(pattern, recursive=True)
        ]

        result_df = pd.DataFrame(files_and_sizes, columns=['Path', 'Size'])
        result_df = result_df.sort_values(by=['Size'], ascending=False)
        result_df.to_excel(os.path.join(upper_dir_path, r'spss_files_sizes.xlsx'), index=False)
        return result_df

    @staticmethod
    def _dir_filepath_prepare_before_conversion(file_path, dir_to_save=None):
        """Return the CSV path for ``file_path`` and make sure its directory exists.

        When ``dir_to_save`` is None the CSV goes to a ``ready_csv``
        subdirectory next to the source file.

        Raises:
            FileNotFoundError: if ``file_path`` is not an existing file.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)

        base_filename = os.path.splitext(os.path.basename(file_path))[0]
        if dir_to_save is None:
            dir_to_save = os.path.join(os.path.dirname(file_path), r'ready_csv')
        # exist_ok: the same output directory is reused for every file, so
        # it usually exists already from the second conversion onwards.
        os.makedirs(dir_to_save, exist_ok=True)
        return os.path.join(dir_to_save, base_filename + r'.csv')

    @staticmethod
    def convert_sav_to_csv(file_path, dir_to_save=None, chunksize=1000):
        """Convert one ``.sav`` file to CSV, reading ``chunksize`` rows at a time.

        Args:
            file_path: path of the ``.sav`` file to convert.
            dir_to_save: output directory; None selects a ``ready_csv``
                subdirectory next to the source file.
            chunksize: number of rows read and written per step.

        Returns:
            The size of the written CSV file in MB.
        """
        path_to_save = ConverterSavCsv._dir_filepath_prepare_before_conversion(file_path, dir_to_save)

        # A metadata-only read is enough to get the row count for progress output.
        _, meta_start = pyreadstat.read_sav(file_path, metadataonly=True)
        number_of_rows = meta_start.number_rows

        reader = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, file_path, chunksize=chunksize)
        for cnt, (df, _meta) in enumerate(reader):
            if number_of_rows is not None:
                first_row = cnt * chunksize
                # Cap the upper bound so the last chunk does not overshoot.
                last_row = min(first_row + chunksize, number_of_rows)
                print('Processing rows', first_row, '-', last_row, 'of', number_of_rows)
            # The first chunk creates the file with a header row; later
            # chunks are appended without repeating the header.
            is_first_chunk = cnt == 0
            # NOTE(review): the DataFrame index is written as the first
            # column -- confirm whether per-chunk index values are wanted.
            df.to_csv(path_to_save, mode="w" if is_first_chunk else "a", header=is_first_chunk)

        return ConverterSavCsv.get_file_size(path_to_save, SizeUnit.MB)

    @staticmethod
    def convert_all_sav_files_to_csv(sav_files_df, dir_to_save=None):
        """Convert every file listed in ``sav_files_df`` to CSV.

        Args:
            sav_files_df: DataFrame with ``Path`` and ``Size`` columns, as
                returned by ``find_all_sav_files``.
            dir_to_save: common output directory, created if missing; None
                lets each file use a ``ready_csv`` directory next to it.

        Returns:
            The number of rows (files) in ``sav_files_df``.

        Raises:
            ValueError: if the frame is empty or lacks a required column.
        """
        if sav_files_df.empty \
                or 'Path' not in sav_files_df.columns \
                or 'Size' not in sav_files_df.columns:
            raise ValueError('Problems with input Pandas dataframe in convert_all_sav_files_to_csv function')

        if dir_to_save is not None:
            os.makedirs(dir_to_save, exist_ok=True)

        files_total = sav_files_df.shape[0]
        for counter, (_, row) in enumerate(sav_files_df.iterrows(), start=1):
            base_filename = os.path.splitext(os.path.basename(row['Path']))[0]
            print('Converting file', base_filename, '-', counter, 'of', files_total, ', with size', row['Size'], 'MB')
            ConverterSavCsv.convert_sav_to_csv(row['Path'], dir_to_save)
            print()
        return files_total
if __name__ == "__main__":
    dir_path = r'C:\Job\sav_files'
    # If dir_to_save_path is None, a ready_csv subdirectory is created
    # and the CSV files are saved there.
    dir_to_save_path = r'C:\Job\sav_files\csv_ready_files'

    # Find all SAV files in the directory and convert them to CSV.
    # The methods are static, so no instance is needed.
    df_sav_files_path_size = ConverterSavCsv.find_all_sav_files(dir_path)
    num_of_files = ConverterSavCsv.convert_all_sav_files_to_csv(df_sav_files_path_size, dir_to_save_path)
    print(num_of_files, '.sav files converted to .csv and saved')

    # For a single file, use
    # ConverterSavCsv.convert_sav_to_csv(file_path, dir_to_save=None)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.