簡體   English   中英

SAS:將CDF計算為加權變量中小於或等於任意指定值的值所占的百分比

[英]SAS calculate CDF as the percentage of values <= any defined values in a weighted variable

對於數據集中的加權變量“leadb”,想法是:給定一個值B,把CDF計算為 <= B 的值所占的百分比,並求出該百分比的置信區間。

我可以使用PROC SURVEYMEANS為指定的任何單個值執行此操作,但是我不知道如何讓SAS同時給我多個百分比。 如果我想對0到max + 1之間、步長為(max + 1)/100的每個值都計算該百分比,應該如何修改代碼?

謝謝!

data Test;
  set data2012;

  /* lead5: indicator that blood lead is <= 5.
     SAS orders missing values below every number, so `leadb <= 5` is TRUE
     when leadb is missing. The missing check therefore has to come first;
     otherwise observations without a measurement are counted as lead5 = 1. */
  if missing(leadb) then lead5 = .;
  else if leadb <= 5 then lead5 = 1;
  else lead5 = 0;

  /* wocba: domain flag, 1 when gender = 2 and age is within 16-49, else 0. */
  if (gender = 2 and age >= 16 and age <= 49) then wocba = 1;
  else wocba = 0;
run;

/* Design-based mean of lead5 within each level of wocba.
   The Domain ODS table is captured as dataset MYSTATS. */
proc surveymeans data=Test;
  ods output domain=mystats;
  strata  stratum;
  cluster psu;
  weight  weight2;
  domain  wocba;
  var     lead5;
run;

/* Keep the wocba = 1 domain row and clamp the confidence limits of the
   estimated proportion to the valid range [0, 1]. */
data mystats;
  set mystats;
  where wocba = 1;
  /* the computed lower limit can fall below zero, but a proportion is >= 0 */
  if not missing(lowerclmean) then lower = max(lowerclmean, 0);
  /* the computed upper limit can exceed one, but a proportion is <= 1 */
  if not missing(upperclmean) then upper = min(upperclmean, 1);
run;

/* Print the domain estimate. lead5 flags leadb <= 5, so the first title
   must describe the proportion at or below 5, not at or above it. */
proc print data=mystats;
  title "Proportion of blood lead values <= 5 for women of child-bearing age (16-49)";
  title2 "Weighted by rates of giving birth by age and race";
  title3 "With a 95% confidence interval";
run;

要計算考慮了加權調查設計的累積分布函數,您需要執行幾個步驟。

  1. 將數據分箱(bin)到所需的區間中。
  2. 用PROC SURVEYFREQ處理分箱後的數據,以獲取每個區間的加權百分比
  3. 使用加權百分比來計算累積加權百分比
  4. 計算95%置信區間

(注意:我不確定把單個區間百分比的標准誤(StdErr)直接用於累計百分比是否合理。您必須自己判斷。但如果是我,我會使用它。)

請參見下面的代碼。 我希望這有幫助!

*** BUILD A 200-ROW SYNTHETIC SURVEY DATASET ***;
data have;
    do i = 1 to 200;

        *** ANALYSIS VARIABLE: EXPONENTIAL DRAW SCALED BY 5 ***;
        leadb = 5 * ranexp(123321);

        *** DESIGN VARIABLES: STRATUM 1-12, PSU 1 OR 2, WEIGHT BELOW 1000 ***;
        stratum = 1 + mod(i, 12);
        psu     = ifn(ranuni(456654) > 0.5, 1, 2);
        weight2 = 1000 * ranuni(1991);

        *** DOMAIN FLAG: 1 WHEN THE UNIFORM DRAW EXCEEDS 0.7, OTHERWISE 0 ***;
        wocba = (ranuni(789987) > 0.7);

        output;
    end;
run;


*** RANGE OF LEADB: DATASET STATS RECEIVES THE VARIABLES MIN AND MAX ***;
proc means data=have noprint;
    var leadb;
    output out=stats min(leadb)=min max(leadb)=max;
run;


*** BUILD A CNTLIN CONTROL DATASET THAT DEFINES EQUAL-WIDTH BINS OVER THE DATA RANGE ***;
*** PROC FORMAT LATER TURNS IT INTO THE NUMERIC FORMAT LEADFMT ***;
*** NOTE(REVIEW): MIN AND MAX ARE ALSO CNTLIN COLUMN NAMES - CONFIRM PROC FORMAT IS NOT AFFECTED BY THEM ***;
data control_dset;
    set stats (drop=_type_ _freq_);

    *** NOTHING TO BIN WHEN THE ANALYSIS VARIABLE HAS NO NON-MISSING VALUES ***;
    if nmiss(min, max) > 0 then do;
        put 'ERROR: min/max of leadb are missing - no intervals were generated.';
        stop;
    end;

    *** WIDEN THE OBSERVED RANGE TO WHOLE NUMBERS ***;
    min = floor(min);
    max = ceil(max);

    *** BIN WIDTH: ABOUT 1/100 OF THE RANGE, ROUNDED TO ONE DECIMAL ***;
    interval = round( (max - min + 1)/100 , 0.1);
    *** A RANGE NARROWER THAN 4 WOULD ROUND TO 0, AN INVALID DO-LOOP INCREMENT ***;
    if interval < 0.1 then interval = 0.1;

    fmtname = 'leadfmt';
    type = 'n';
    eexcl = 'Y';    *** END VALUE IS EXCLUDED FROM RANGE ***;

    *** AN INTEGER COUNTER AVOIDS THE ROUNDING DRIFT OF REPEATEDLY ADDING A FRACTIONAL STEP ***;
    *** EACH END IS COMPUTED EXACTLY LIKE THE NEXT START, SO BINS NEITHER GAP NOR OVERLAP ***;
    do bin = 0 to ceil( (max - min)/interval );
        start = round(min + bin*interval, 0.1);
        end   = round(min + (bin + 1)*interval, 0.1);

        *** EACH BIN IS LABELLED WITH ITS LOWER BOUND ***;
        label = start;
        output;
    end;
run;

*** LOAD THE CONTROL DATASET AS A FORMAT IN THE WORK CATALOG ***;
proc format library=work cntlin=control_dset;
run;

*** APPLY THE FORMAT SO EACH OBSERVATION CARRIES THE LABEL OF ITS BIN AS A NUMBER ***;
data start;
    set have;
    *** PUT RETURNS THE BIN LABEL AS TEXT. CONVERT IT EXPLICITLY WITH INPUT RATHER THAN ***;
    *** ADDING ZERO, WHICH FORCES AN AUTOMATIC CHARACTER-TO-NUMERIC CONVERSION NOTE IN THE LOG ***;
    lead_interval = input( put(leadb, leadfmt.), best32. );
run;


*** LIST THE ODS TABLE NAMES ALONGSIDE THE OUTPUT WHILE THE PROCEDURE RUNS ***;
ods trace on / listing;

*** USE SURVEYFREQ TO GET DESIGN-BASED PERCENTS FOR THE BINNED VALUES ***;
*** SURVEYFREQ HAS NO DOMAIN STATEMENT, SO THE DOMAIN VARIABLE IS CROSSED WITH THE BINS ***;
*** AND THE ROW PERCENTS FOR THE DOMAIN LEVEL OF INTEREST ARE PICKED FROM THE OUTPUT ***;
proc surveyfreq data=start;
    strata  stratum;
    cluster psu;
    weight  weight2;
    *** DOMAIN VARIABLE FIRST, SO ROW PERCENTS ARE WITHIN EACH DOMAIN LEVEL ***;
    tables  wocba * lead_interval / row;
    *** SAVE THE DESIGN SUMMARY AND THE CROSSTAB AS DATASETS ***;
    ods output summary=summary crosstabs=crosstabs;
run;

ods trace off;


*** ACCUMULATE THE ROW PERCENTS OF THE WOCBA = 1 DOMAIN INTO A RUNNING TOTAL ***;
data really_close (drop=Percent StdErr StdDev);
    *** READ DOMAIN ROWS ONLY, SKIPPING THE TOTAL ROW AND BINS WITH A ZERO COUNT ***;
    set crosstabs (where=(wocba = 1
                          and strip(F_lead_interval) ne 'Total'
                          and frequency > 0));

    *** SUM STATEMENT: THE ACCUMULATOR IS RETAINED, STARTS AT 0 AND IGNORES MISSING VALUES ***;
    CumRowPercent + RowPercent;
run;


*** CAVEAT: ROWSTDERR IS THE STANDARD ERROR OF A SINGLE BIN PERCENT. WHETHER IT MAY BE ***;
*** APPLIED TO THE CUMULATIVE ROW PERCENT IS UNVERIFIED - CONSULT A STATISTICIAN ***;

*** DERIVE THE T CRITICAL VALUE USED FOR THE 95% CONFIDENCE INTERVAL ***;
*** OUTPUT: ONE ROW WITH NCLUS, NSTRAT, DF AND TSTAT ***;
data tstat;
    *** SUMMARY STATISTICS FROM PROC SURVEYFREQ, ONE ROW PER LABEL1 ITEM ***;
    set summary end=lastrec;
    retain nclus nstrat;

    *** PICK UP THE STRATA AND CLUSTER COUNTS AS THEIR ROWS GO BY ***;
    if index( upcase(Label1), 'STRATA') then nstrat = nvalue1;
    else if index( upcase(Label1), 'CLUSTER') then nclus = nvalue1;

    *** COMPUTE ONLY AFTER THE LAST SUMMARY ROW, WHEN BOTH COUNTS HAVE BEEN SEEN. ***;
    *** DOING IT ON EVERY ROW CALLS QUANTILE WITH A MISSING DF ON THE EARLY ROWS ***;
    if lastrec then do;
        *** DEGREES OF FREEDOM = NUMBER OF CLUSTERS - NUMBER OF STRATA ***;
        df = nclus - nstrat;

        *** THE T QUANTILE NEEDS POSITIVE DEGREES OF FREEDOM. OTHERWISE TSTAT STAYS MISSING ***;
        if df > 0 then tstat = abs( quantile('T', 0.05/2, df) );
        else put 'WARNING: degrees of freedom are not positive - tstat is missing. ' nclus= nstrat= df=;

        output;
    end;

    drop label1 cvalue1 nvalue1;
run;


*** ATTACH THE T CRITICAL VALUE TO EVERY BIN AND BUILD THE 95% INTERVAL LIMITS ***;
*** CAVEAT: THE HALF-WIDTH USES THE STANDARD ERROR OF THE SINGLE BIN PERCENT (ROWSTDERR) ***;
data want (keep=lead_interval CumRowPercent CumRowPct_Lower CumRowPct_Upper);
    set really_close;
    *** TSTAT IS READ ONCE. VARIABLES READ BY SET ARE RETAINED, SO IT APPLIES TO ALL ROWS ***;
    if _N_ = 1 then set tstat;

    halfwidth = tstat * RowStdErr;

    *** LOWER LIMIT, FLOORED AT 0. MAX IGNORES MISSING, SO A MISSING LIMIT ALSO BECOMES 0 ***;
    CumRowPct_Lower = max(0, CumRowPercent - halfwidth);

    *** UPPER LIMIT, CAPPED AT 100. A MISSING LIMIT FAILS THE COMPARISON AND STAYS MISSING ***;
    CumRowPct_Upper = CumRowPercent + halfwidth;
    if CumRowPct_Upper > 100 then CumRowPct_Upper = 100;
run;

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM