简体   繁体   中英

Merge two datasets in Python

I have two sets of xy data, whose x values should be merged. To illustrate, the first set looks something like this:

0.5;3.4
0.8;3.8
0.9;1.2
1.3;1.1
1.9;2.3

And the second set like this:

0.3;-0.2
0.8;-0.9
1.0;0.1
1.5;1.2
1.6;6.3

The data is in two separate csv files. I would like to merge both files into one, so that the x values are in order, and the y values appear in two columns with their (linearly) interpolated values ( y1 and y2 ) completed. The second column contains the y values of the first dataset (plus interpolated values) and the third column the y values of the second dataset.

0.3;y1;-0.2
0.5;3.4;y2
0.8;3.8;-0.9
0.9;1.2;y2
1.0;y1;0.1
1.3;1.1;y2
1.5;y1;1.2
1.6;y1;6.3
1.9;2.3;y2

My only idea so far would be to read the data into numpy arrays, join them together, sort the values and calculate the average of preceding and following values, in case the value is empty.

Is there a more elegant way to do this in Python?

Edit: Here is my attempt. It works and delivers the result I imagined, although the script is quite long.

#-*- coding: utf-8 -*-

import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import interp1d
import csv

# Read data files and turn them into numpy array for further processing
def read_datafile(file_name):
    data = np.loadtxt(file_name, delimiter=";")
    return data

data1 = read_datafile("testcsv1.csv")
data2 = read_datafile("testcsv2.csv")

# Add empty column at the appropriate position
emptycol1 = np.empty((len(data1), 3))
emptycol1[:] = np.nan
emptycol2 = np.empty((len(data2), 3))
emptycol2[:] = np.nan
emptycol1[:,:-1] = data1
emptycol2[:,[0, 2]] = data2

# Merge and sort the data sets. Create empty array to add final results
merged_temp = np.concatenate((emptycol1, emptycol2))
merged_temp = np.array(sorted(merged_temp, key = lambda x: float(x[0])))
merged = np.empty((1, 3))

# Check for entries where the x values already match. Merge those into one row
i = 0
while i < len(merged_temp)-1:
    if merged_temp[i, 0] == merged_temp[i+1, 0]:
        newrow = np.array([merged_temp[i, 0], merged_temp[i, 1], merged_temp[i+1, 2]])
        merged = np.vstack((merged, newrow))
        i += 2
    else:
        newrow = np.array([merged_temp[i, 0], merged_temp[i, 1], merged_temp[i, 2]])
        merged = np.vstack((merged, newrow))
        i += 1

# Check for so far undefined values (gaps in the data). Interpolate between them (linearly)
for i in range(len(merged)-1):
    # First y column
    if np.isnan(merged[i, 1]) == True:
        # If only one value is missing (maybe not necessary to separate this case)
        if (np.isnan(merged[i-1, 1]) == False) and (np.isnan(merged[i+1, 1]) == False):
            merged[i, 1] = (merged[i-1, 1] + merged[i+1, 1])/2
        # If two or more values are missing
        elif np.isnan(merged[i, 1]) == True:
            l = 0
            while (np.isnan(merged[i+l, 1]) == True) and (i+l != len(merged)-1):
                l += 1
            x1 = np.array([i-1, i+l])                       # endpoints
            x = np.linspace(i, i+l-1, l, endpoint=True)     # missing points
            y = np.array([merged[i-1, 1], merged[i+l, 1]])  # values at endpoints
            f = interp1d(x1, y)                             # linear interpolation
            for k in x:
                merged[k, 1] = f(k)
    # Second y column
    if np.isnan(merged[i, 2]) == True:
        # If only one value is missing
        if (np.isnan(merged[i-1, 2]) == False) and (np.isnan(merged[i+1, 2]) == False):
            merged[i, 2] = (merged[i-1, 2] + merged[i+1, 2])/2
        # If two or more values are missing
        elif np.isnan(merged[i, 2]) == True:
            l = 0
            while (np.isnan(merged[i+l, 2]) == True) and (i+l != len(merged)-1):
                l += 1
            x1 = np.array([i-1, i+l])                       # endpoints
            x = np.linspace(i, i+l-1, l, endpoint=True)     # missing points
            y = np.array([merged[i-1, 2], merged[i+l, 2]])  # values at endpoints
            f = interp1d(x1, y)                             # linear interpolation
            for k in x:
                merged[k, 2] = f(k)

# Remove lines which still have "nan" values (beginning and end). This could be prevented by an extrapolation
merged = merged[~np.isnan(merged).any(axis=1)]
merged = np.delete(merged, (0), axis=0)

# Write table to new csv file in the same directory
with open("testcsv_merged.csv", "w") as mergedfile:
    writer = csv.writer(mergedfile)
    [writer.writerow(r) for r in merged]

I would use pandas for this type of processing:

import pandas as pd
#I assumed you have no headers in the data files
df1 = pd.read_csv('./dataset1.txt',sep=';',header=None)
df2 = pd.read_csv('./dataset2.txt',sep=';',header=None)
#Join the datasets using full outer join on the first column in both datasets
df_merged = df1.merge(df2, on=0, how='outer')
#Fill the nulls with the desirable values in this case the average of the column
df_merged['1_x'].fillna(df_merged['1_x'].mean(),inplace=True)
df_merged['1_y'].fillna(df_merged['1_y'].mean(),inplace=True)

Output:

print(df_merged)
    0   1_x 1_y
0   0.5 3.4 y2
1   0.8 3.8 -0.9
2   0.9 1.2 y2
3   1.3 1.1 y2
4   1.9 2.3 y2
5   0.3 y1  -0.2
6   1.0 y1  0.1
7   1.5 y1  1.2
8   1.6 y1  6.3

You can change the column names easily:

df_merged.columns = ['col1','col2','col3']

You can also sort values easily using sort_values method:

df_merged.sort_values('col1')

Finally, you can convert this final DataFrame into a numpy array easily using:

import numpy as np
np.array(df_merged)

A one liner : dfi = pd.merge(df1,df2,'outer',0).set_index(0).sort_index().interpolate()

In [383]: dfi
Out[383]: 
      1_x   1_y
0              
0.3   NaN -0.20
0.5  3.40 -0.55
0.8  3.80 -0.90
0.9  1.20 -0.40
1.0  1.15  0.10
1.3  1.10  0.65
1.5  1.50  1.20
1.6  1.90  6.30
1.9  2.30  6.30

A complete pandas version + numpy interpolate for better tuning at the edges :

#df1 = pd.read_clipboard(header=None,sep=';')
#df2 = pd.read_clipboard(header=None,sep=';')

import pylab as pl

df = pd.merge(df1,df2,'outer',0).sort_values(0)
df['y1']=scipy.interpolate.interp1d(*df1.values.T,fill_value='extrapolate')(df[0])
df['y2']=scipy.interpolate.interp1d(*df2.values.T,fill_value='extrapolate')(df[0])

ax=pl.gca()
df1.set_index(0).plot(lw=0,marker='o',ax=ax)
df2.set_index(0).plot(lw=0,marker='o',ax=ax)
df.set_index(0).loc[:,['y1','y2']].plot(ax=ax)    
pl.show()

plots :

在此处输入图片说明

data :

In [344]: df1
Out[344]: 
     0    1
0  0.5  3.4
1  0.8  3.8
2  0.9  1.2
3  1.3  1.1
4  1.9  2.3

In [345]: df2
Out[345]: 
     0    1
0  0.3 -0.2
1  0.8 -0.9
2  1.0  0.1
3  1.5  1.2
4  1.6  6.3

In [346]: df
Out[346]: 
     0  1_x  1_y         y1         y2
5  0.3  NaN -0.2 -20.713281  -0.200000
0  0.5  3.4  NaN   3.400000  -3.021563
1  0.8  3.8 -0.9   3.800000  -0.900000
2  0.9  1.2  NaN   1.200000  -0.092830
6  1.0  NaN  0.1  -0.265527   0.100000
3  1.3  1.1  NaN   1.100000  -1.960323
7  1.5  NaN  1.2   3.760937   1.200000
8  1.6  NaN  6.3   4.701230   6.300000
4  1.9  2.3  NaN   2.300000  44.318059

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM