Generating multiple scatter_matrix plots in the same chart with pandas

Question

I have two dataframes with identical column names. I would like to produce pairplot scatter plots to understand how the variables interact. I would like to plot the first dataframe in a different color than the second dataframe. Is this possible? It seems like the scatter_matrix function overwrites the previous plot by default.

Why is my first-generated plot overwritten? How can I visualize both data frames at once using the scatter_matrix function?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Two sample frames that share the column labels x0, x1, x2; each column holds
# 100 uniform random draws (first frame from -1..1, second frame from 0..1).
dat = pd.DataFrame({'x%i' % ii: np.random.uniform(-1, 1, 100) for ii in range(3)})
dat2 = pd.DataFrame({'x%i' % ii: np.random.uniform(0, 1, 100) for ii in range(3)})
# First scatter matrix (orange); the returned axes are kept in `ax`.
ax = pd.plotting.scatter_matrix(dat, c='orange')
# Second scatter matrix (black), drawn without passing the first call's axes.
# NOTE(review): presumably this call builds its own figure instead of drawing
# on top of the first one -- that is the behaviour the question asks about.
pd.plotting.scatter_matrix(dat2, c='k')
# pd.plotting.scatter_matrix(dat2, c='k', ax=ax) # results in error
# Writes matplotlib's current figure to a file named 'example'.
plt.savefig('example')

(The solution I desire should have two separate point colors, with one set ranging from 0 to 1 and the other ranging from -1 to 1.)

Answer 1

If you are willing to use another library called seaborn, and if I understood correctly, it can be done easily with sns.pairplot. You just need to concatenate both dataframes and create a column to use as hue, holding the names you want in the legend.

import seaborn as sns
# Tag every row with the frame it came from; seaborn colours the points by
# that column and uses its values as legend entries.
# ignore_index=True renumbers the stacked rows 0..N-1, so the combined frame
# does not carry every index label twice (both inputs are indexed 0..99).
sns.pairplot(pd.concat([dat.assign(hue='dat'),
                        dat2.assign(hue='dat2')],
                       ignore_index=True),
             hue='hue',
             diag_kind='hist',
             palette=['orange', 'k'])

Note: I find that the diagonal does not look good with histograms in this case; I would rather use 'kde' instead of 'hist' for the diag_kind parameter, but it depends on what you want.

Answer 2

There seems to be an issue within the pandas source code where the scatter_matrix() function could technically work with the 2D-array of axes passed with the ax=ax option, but the call to fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) seems to flatten this array of axes into a 1D-list, which is then not compatible later on in the code with ax = axes[i, j] . Perhaps one should create a bug for this.

Apart from this, I'd suggest writing your own code (loosely based on the original scatter_matrix()) to get more control over what you're plotting. The following example shows how you can gain more granular control over what you're actually plotting (e.g. figure size and margins, colours of plot objects and so on):

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Pin the RNG state so the sample data is the same on every run.
np.random.seed(2020)

# Both frames use the same three column labels; only the sampling range
# differs (-1..1 for the first frame, 0..1 for the second).
_labels = ['x%i' % ii for ii in range(3)]
dat = pd.DataFrame({label: np.random.uniform(-1, 1, 100) for label in _labels})
dat2 = pd.DataFrame({label: np.random.uniform(0, 1, 100) for label in _labels})


def _merge_limits(previous, wanted):
    """Return ``wanted`` widened so that it also covers ``previous`` (if any)."""
    if previous is None:
        return wanted
    return (min(min(previous), wanted[0]), max(max(previous), wanted[1]))


def create_plot(
    axes,
    df=None,
    diagonal="hist",
    density_kwds=None,
    hist_kwds=None,
    marker='.',
    alpha=0.5,
    color="blue",
    **kwds
):
    """
    Draw a scatter matrix of ``df`` onto an existing grid of axes.

    Loosely based on ``scatter_matrix()`` in:
    https://github.com/pandas-dev/pandas/blob/526f40431a51e1b1621c30a4d74df9006e0274b8/pandas/plotting/_matplotlib/misc.py

    The function can be called several times with the same ``axes`` to
    overlay datasets: when an axes already holds data, its limits are
    widened to include the new dataset instead of being replaced.

    Parameters
    ----------
    axes : container indexed as ``axes[i, j]``
        One axes per (row, column) pair of ``df``'s columns.
    df : pandas.DataFrame
        Data to plot; missing values are dropped per column / column pair.
    diagonal : str
        Either "hist", "kde" or "density"; any other value leaves the
        diagonal panels empty.
    density_kwds : dict, optional
        Extra keyword arguments for the density line (``ax.plot``).
    hist_kwds : dict, optional
        Extra keyword arguments for the histogram (``ax.hist``).
    marker, alpha, color
        Appearance of the scatter points; ``color`` is also used on the
        diagonal and ``alpha`` for the histogram.
    **kwds
        Extra keyword arguments for ``ax.scatter``.

    Raises
    ------
    ValueError
        If ``df`` is not given.
    """
    if df is None:
        raise ValueError("create_plot() requires a DataFrame passed as 'df'")

    range_padding = 0.05
    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    draw_density = diagonal in ("kde", "density")
    if draw_density:
        # Imported lazily so scipy is only needed when a density is requested.
        from scipy.stats import gaussian_kde

    columns = list(df.columns)
    # Matrix size comes from the data itself rather than a module-level global.
    n = len(columns)

    ## fix input data
    mask = pd.notna(df)

    # Padded (min, max) range of every column, used as axis limits.
    boundaries_list = []
    for a in columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    ## iterate over columns
    for i, a in enumerate(columns):
        for j, b in enumerate(columns):
            ax = axes[i, j]  ## to abbreviate the code

            # Capture the limits of an earlier overlay *before* drawing, so
            # this call can extend them rather than clip the earlier data.
            populated = ax.has_data()
            prev_xlim = ax.get_xlim() if populated else None
            prev_ylim = ax.get_ylim() if populated else None

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    ax.hist(values, color=color, alpha=alpha, **hist_kwds)

                elif draw_density:
                    gkde = gaussian_kde(values)
                    ind = np.linspace(values.min(), values.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), color=color, **density_kwds)

                ax.set_xlim(_merge_limits(prev_xlim, boundaries_list[i]))

            else:
                # Keep only the rows where both columns have a value.
                common = (mask[a] & mask[b]).values

                ax.scatter(
                    df[b][common], df[a][common],
                    marker=marker, alpha=alpha, color=color, **kwds
                )

                ax.set_xlim(_merge_limits(prev_xlim, boundaries_list[j]))
                ax.set_ylim(_merge_limits(prev_ylim, boundaries_list[i]))

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            # Only the left column keeps its y axis and only the bottom row
            # keeps its x axis.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)


## set up a square canvas for the plot matrix
fig = plt.figure(figsize=(5, 5))

## the first dataset decides how many rows/columns the matrix has
n = dat.columns.size
print(f"-- creating a {n}x{n} matrix of plots --")

## lay out an n x n grid with no gaps between neighbouring panels
grid_options = dict(
    left=0.12, right=.97,
    bottom=0.12, top=.97,
    wspace=0, hspace=0,
)
gs = mpl.gridspec.GridSpec(n, n, **grid_options)

## one axes object per grid cell, addressed as axes[row, col]
axes = {
    (row, col): plt.subplot(gs[row, col])
    for row in range(n)
    for col in range(n)
}

## draw both datasets onto the same axes, one colour each
create_plot(axes, df=dat, color="blue")
create_plot(axes, df=dat2, color="red")
plt.show()

yields:

Generating multiple scatter_matrix plots in the same chart with pandas

Question

2 answers

solution1
8 ACCEPTED 2020-07-10 12:27:23

solution2
3 2020-07-10 08:26:26

Generating multiple scatter_matrix plots in the same chart with pandas

Question

2 answers

solution1 8 ACCEPTED 2020-07-10 12:27:23

solution2 3 2020-07-10 08:26:26

solution1
8 ACCEPTED 2020-07-10 12:27:23

solution2
3 2020-07-10 08:26:26