在理解 Seaborn regplot 中的 x_bin 时遇到一些问题

Question

我使用 seaborn.regplot 绘制数据，但不太明白 regplot 中的误差条是如何计算的。我将 regplot 的结果与手动计算得到的均值和标准差进行了比较。以下是我的测试脚本。

import numpy as np
import pandas as pd
import seaborn as sn


def get_data_XYE(p):
    """Recover centre points and half-heights from the line artists of *p*.

    Every object in ``p.lines`` is read as one vertical segment: its first
    x value gives the position, and its first two y values give the lower
    and upper ends.

    Args:
        p: Object exposing a ``lines`` iterable whose items provide
            ``get_xdata()`` and ``get_ydata()`` (presumably the Matplotlib
            Axes returned by ``regplot`` -- confirm against the caller).

    Returns:
        A tuple ``(x, y, y_error)`` of NumPy arrays, where ``y`` is the
        midpoint of each segment and ``y_error`` is the distance from that
        midpoint to the upper end.
    """
    # One (x, lower, upper) triple per segment.
    segments = [
        (seg.get_xdata()[0], seg.get_ydata()[0], seg.get_ydata()[1])
        for seg in p.lines
    ]
    x = np.asarray([s[0] for s in segments])
    lower = np.asarray([s[1] for s in segments])
    upper = np.asarray([s[2] for s in segments])

    y = 0.5 * (lower + upper)
    y_error = upper - y
    return x, y, y_error

# Raw sample data: x values and the y values plotted against them.
x = [37.3448,36.6026,42.7795,34.7072,75.4027,226.2615,192.7984,140.8045,242.9952,458.451,640.6542,726.1024,231.7347,107.5605,200.2254,190.0006,314.1349,146.8131,152.4497,175.9096,284.9926,116.9681,118.2953,312.3787,815.8389,458.0146,409.5797,595.5373,188.9955,15.7716,36.1839,244.8689,57.4579,94.8717,112.2237,87.0687,72.79,22.3457,24.1728,29.505,80.8765,252.7454,280.6002,252.9573,348.246,112.705,98.7545,317.0541,300.9573,402.8411,406.6884,56.1286,30.1385,32.9909,497.556,19.3606,20.8409,95.2324,108.6074,15.7753,54.5511,45.5623,64.564,101.1934,81.8459,88.286,58.2642,56.1225,51.2943,38.0649,63.5882,63.6847,120.495,102.4097,49.3255,111.3309,171.6028,58.9526,28.7698,144.6884,180.0661,116.6028,146.2594,199.8702,128.9378,423.2363,119.8537,124.6508,518.8625,306.3023,79.5213,121.0309,116.9346,170.8863,930.361,48.9983,55.039,47.1092,72.0548,75.4045,103.521,83.4134,142.3253,146.6215,121.4467,101.4252,68.4812,291.4275,143.9475,142.647,78.9826,47.094,204.2196,89.0208,82.792,27.1346,142.4764,83.7874,67.3216,112.9531,138.2549,133.3446,86.2659,45.3464,56.1604,43.5882,54.3623,86.296,115.7272,96.5498,111.8081,36.1756,40.2947,34.2532,89.1452,53.9062,36.458,113.9297,176.9962,77.3125,77.8891,64.807,64.1515,127.7242,119.6876,976.2324,322.8454,434.2883,168.6923,250.0284,234.7329,131.0793,152.335,118.8838,243.1772,24.1776,168.6327,170.7541,167.8444,75.9315,110.1045,113.4417,60.5464,66.8956,79.7606,71.6659,72.5251,77.513,207.8019,21.8592,35.2787,169.7698,146.5012,412.9934,248.0708,318.5489,104.1278,184.7592,108.0581,175.2646,169.7698,340.3732,570.3396,23.9853,69.0405,66.7391,67.9435,294.6085,68.0537,77.6344,433.2713,104.3178,229.4615,187.8587,78.1399,121.4737,122.5451,384.5935,38.5232,117.6835,50.3308,318.2513,103.6695,20.7181,321.9601,510.3248,13.4754,16.1188,44.8082,37.7291,733.4587,446.6241,21.1822,287.9603,327.2367,274.1109,195.4713,158.2114,64.4537,26.9857,172.8503]
y  = [37,40,30,29,24,23,27,12,21,20,29,28,27,32,23,29,28,22,28,23,24,29,32,18,22,12,12,14,29,31,34,31,22,40,25,36,27,27,29,35,33,25,25,27,27,19,35,26,18,24,25,37,52,47,34,39,40,48,41,44,35,36,53,46,38,44,23,26,26,28,27,21,25,21,20,27,35,24,46,34,22,30,30,30,31,26,25,28,21,31,24,27,33,21,31,33,29,33,32,21,25,22,39,31,34,26,23,18,20,18,34,25,20,12,23,25,21,21,25,31,17,27,28,29,25,24,25,21,24,27,23,22,23,22,22,26,22,19,26,35,33,35,29,26,26,30,22,32,33,33,28,32,26,29,36,37,37,28,24,30,25,20,29,24,33,35,30,32,31,33,40,35,37,24,34,29,27,24,36,26,26,26,27,27,20,17,28,34,18,20,20,18,19,23,20,22,25,32,44,41,39,41,40,44,36,42,31,32,26,29,23,29,29,28,31,22,29,24,28,28,25]
# Break points along x; the script below passes this same list to pd.cut
# (as bin edges) and to regplot's x_bins argument.
xbreaks = [13.4754, 27.1346,  43.5882,  58.9526,  72.79,  89.1452,  110.1045,  131.0793,  158.2114,  180.0661,  207.8019,  234.7329,  252.9573,  300.9573,  327.2367,  348.246,  412.9934,  434.2883,  458.451,  518.8625,  595.5373,  640.6542,  733.4587,  815.8389,  930.361,  976.2324]
# Stack x and y as two rows, then transpose so each becomes a column.
df = pd.DataFrame([x,y]).T
df.columns = ['x','y']

# Reference statistics computed with pandas: per-bin mean of x, plus the
# mean and standard deviation of y, using xbreaks as left-closed bin edges.
bins = pd.cut(df['x'], xbreaks, right=False)
t = (
    df[['x', 'y']]
    .groupby(bins)
    .agg({"x": "mean", "y": ["mean", "std"]})
    .reset_index()
)
# Replace the two-level column header produced by the dict aggregation
# with flat names.
t.columns = ['range_cut', 'x_avg_cut', 'y_avg_cut', 'y_std_cut']
t.index.name = 'id'

# Get the bin averages and error bars as drawn by seaborn.regplot.
# seaborn is imported in this script under the alias `sn`, so it must be
# referenced by that name.
seed = 0  # fixed seed so regplot's bootstrap resampling is reproducible
g = sn.regplot(x='x', y='y', data=df, fit_reg=False, x_bins=xbreaks, seed=seed)
xye = pd.DataFrame(get_data_XYE(g)).T
xye.columns = ['x_regplot', 'y_regplot', 'e_regplot']
xye.index.name = 'id'

# Left-join the regplot values with the pandas statistics on the shared
# 'id' index name so both sets of numbers sit side by side.
t2 = xye.merge(t, on='id', how='left')
t2

可以看出，两种方式得到的 y 和 e 并不相同。我知道默认的 x_ci 或 x_estimator 可能会影响 regplot 的结果，但即使在 Excel 中去掉每个 bin 里的一些最低和/或最高值，我仍然无法得到这些数值。

Answer 1

在 seaborn.regplot 中，x_bins 给出的是每个 bin 的中心，原始 x 值会被分配给距离最近的 bin 中心；而在 pandas.cut 中，断点定义的是 bin 的边缘。

在理解 Seaborn regplot 中的 x_bin 时遇到一些问题

问题描述

1 个解决方案

解决方案1
0 已采纳 2021-07-19 16:32:42

在理解 Seaborn regplot 中的 x_bin 时遇到一些问题

问题描述

1 个解决方案

解决方案1 0 已采纳 2021-07-19 16:32:42

解决方案1
0 已采纳 2021-07-19 16:32:42