*
;
/* Turn off echoing of source statements to the SAS log; the SOURCE
   option is switched back on by the OPTIONS statement at the end
   of this file. */
options nosource;

/*  DO_ITALL - CREATES SAMPLES OF DIFFERENT SAMPLE SIZES,
               TESTS ALL SAMPLES AGAINST A GIVEN TEST DISTRIBUTION
               WITH A GIVEN ALPHA TO CALCULATE CONFIDENCE INTERVALS,
               AND PRODUCES A GRAPH WITH BOX-AND-WHISKER PLOTS.
               SAMPLES ARE SELECTED WITH THE UNIFORM RANDOM FUNCTION
               RANUNI(0).

    Written:  October 6 and 7, 2000
    Developed using SAS 6.12 for Windows
    Author:   Arnold Schick

    Procs:    PROC MEANS, PROC SORT, PROC GPLOT, PROC APPEND
    Other:    SAS MACRO language, %SYSFUNC MACRO functions
    Macros:   INTERVAL and some utilities, like DATE, DATUM, PURGE,
              all included in the source here.
              INTERVAL can also be used to test samples separately
              (see explanation of macro INTERVAL below).

    Created Data Sets: RESULT with statistic values of every sample.
                      _LABEL_ to annotate the plot.

    Note:     Do not use _ONE or _TWO or _THREE or _MINMAX_ or
              RESULT or _LABEL_ as a data set name.


    Macro Call: %DO_ITALL(DATA,VAR,ALPHA,MIN,MAX,STEP,DISTRIBUTION);

MACRO VARIABLE      DESCRIPTION
------------------+-------------------------------------------------
 In Request:

 DATA               Name of SAS data set with input data.
                    If the value is SHOWGRAPH or ZEIGEGRAFIK
                    then it will jump to the plot part and issue
                    the last produced PLOT only.

 VAR                Name of the analysis variable within SAS data set DATA.

 ALPHA              Specifies that confidence intervals
                    are to be  100(1-p) percent confidence intervals,
                    where 0.0001 < p < 0.9999.
                    ALPHA is given in percent (p = ALPHA/100), e.g.
                    ALPHA=3 requests 97 percent confidence intervals.

 MIN                Minimum sample size of first created sample
                    SAS data set.

 MAX                Maximum sample size of last created sample
                    SAS data set.

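 STEP               Step size between the sizes of consecutive sample
                    SAS data sets. The number of created sample SAS
                    data sets is INT((MAX-MIN)/STEP)+1, from MIN to
                    MAX by STEP.
                    Data set names (library WORK) of created samples:
                    WORK.SAMP<n><d>, where <n> is the sample size and
                    <d> is the first character of DISTRIBUTION.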

 DISTRIBUTION       Name of test distribution
                    where:
                      T or STUDENT or
                      GOSSET            t- or Student distribution
                      NORMAL            normal distribution
                      F or FISHER       F- or Fisher distribution
                      CHI or CHI SQUARE Chi-Square distribution
                                        note: do not use CHI-SQUARE
--------------------------------------------------------------------


    Macro Call: %INTERVAL(DATA,RESULT,VAR,N,ALPHA,DISTRIBUTION);

MACRO VARIABLE      DESCRIPTION
------------------+-------------------------------------------------
 In Request:

 DATA               Name of SAS data set with input data.

 RESULT             Name of SAS data set with result data.

 VAR                Name of the analysis variable within SAS data set DATA.

 N                  Number of observations in sample SAS data set RESULT

 ALPHA              Specifies that confidence intervals
                    are to be  100(1-p) percent confidence intervals,
                    where 0.0001 < p <0.9999.

 DISTRIBUTION       Name of test distribution
                    where
                      T or STUDENT or
                      GOSSET            t- or Student distribution
                      NORMAL            normal distribution
                      F or FISHER       F- or Fisher distribution
                      CHI or CHI SQUARE Chi-Square distribution
                                        note: do not use CHI-SQUARE
--------------------------------------------------------------------

Example:

data one;
     keep weight;
     do i=1 to 1700;
        weight = 40 + (80-40)*ranuni(0);
        output;
     end;
run;

     *   data,variable,alpha,min,max,step,test distribution ;
     *     |    |      |      |   |  |    |                 ;
%do_itALL(one,weight  ,3    , 12,182,40  ,normal);
%do_itALL(one,weight  ,3    , 15, 85,20  ,chi square);  * use:chi or chi square >>not<< chi-square;
%do_itALL(one,weight  ,3    , 15, 85,20  ,student);
%do_itALL(one,weight  ,3    , 15, 85,20  ,fisher);


or call INTERVAL separately:

     *   data,result,variable,sample size,alpha,test distribution ;
     *     |    |      |      |           |     |  ;
%interval(one,mydata,weight  ,29         ,3    ,f);



For more information:

 Arnold Schick

 e-mail: schick@yours.com

 If you find an error condition (the code is provided 'as is'),
 please let me know about it.  And if you have good tips for a
 better formulation in SAS, please let me know as well.

  */


%macro purge(ds);
  /* Utility macro: delete data set DS, but only when it exists.
     EXIST() returns 1 for an existing data set and 0 otherwise, so
     its result is used directly as the condition. */
  %if %sysfunc(exist(&ds)) %then %do;
    proc delete data=&ds; run;
  %end;
%mend purge;

%macro interval(data,ds_name,var,n,alpha,distrib);
  /* INTERVAL - draws a random sample from DATA, stores it in DS_NAME,
                computes a confidence interval for VAR and appends one
                row with the statistic values to data set RESULT.

     Parameters:
       data     input SAS data set
       ds_name  name of the sample data set to create
       var      analysis variable within DATA
       n        requested sample size (the sample is smaller when DATA
                has fewer observations)
       alpha    alpha in percent; the quantile is taken at 1-alpha/200
       distrib  T, STUDENT or GOSSET / NORMAL / CHI or CHI SQUARE /
                F or FISHER

     Side effects:
       - creates DS_NAME, creates and purges _ONE, _TWO and _THREE
       - appends to RESULT (PROC APPEND with FORCE)
       - sets the global macro variables MIN and MAX
       - writes one line with the computed values to the log

     Nothing is appended when the sample turns out to be empty.       */
  %global min max;
  %local obs dsid median;   /* work variables stay private to this macro */

  data _one;                /* create uniform order */
    set &data;
    _order=ranuni(0);
  run;

  proc sort data=_one;      /* sort by order */
     by _order;
  run;

  data &ds_name (label="&distrib"); /* create data set with sample of size &n */
     set _one (drop=_order);
     if _N_ <= &n;
  run;

  %purge(_one);            /* purge utility data set _ONE */

  %let obs=0;              /* check sample size gt 0  */
  %let dsid=%sysfunc(open(&ds_name,I));
  %if &dsid > 0 %then %do;
      %let obs=%sysfunc(attrn(&dsid,NLOBS));
      %let dsid=%sysfunc(close(&dsid));
  %end;
  %if &obs=0 %then %goto fin;

  proc sort data=&ds_name; /* sort variable by its order */
       by &var;
  run;
                           /* find out statistic values  */
  proc means data=&ds_name noprint;
       output out=_two (drop= _type_ _freq_);
  run;

  /* Look up the median of the sorted sample.  The position is derived
     from the actual number of observations (obs), not from the
     requested size: for an odd count this is the middle element, for
     an even count the upper of the two middle elements.  Because obs
     is at least 1 here, the condition always fires and the macro
     variable MEDIAN is always set.                                   */
  data _null_;
    set &ds_name;
    if _N_ > int(&obs/2) then do;
      call symput('median',&var);
      stop;
    end;
  run;

  data _three;              /* calculate confidence interval CI */
    set _two end=last;
    length ds_name $8. distrib $18. ;
    retain mean t std n min max;
    keep ci_u ci_l t std mean n alpha min max median ds_name distrib
         DF var;

    /* pick the statistics out of the PROC MEANS output, one per row */
    if compress(_stat_) = "MEAN" then mean= &var;
    if compress(_stat_) = "N"    then n   = &var;
    if compress(_stat_) = "STD"  then std = &var;
    if compress(_stat_) = "MIN"  then min = &var;
    if compress(_stat_) = "MAX"  then max = &var;

    if last then do;
      distrib = "&distrib &alpha% CI";
      var = "&var";
      _a_ = 1-(&alpha/100/2);     /* two-sided: upper quantile level */
      alpha = &alpha;
      %if %upcase(&distrib)=%STR(T) or
          %upcase(&distrib)=STUDENT or
          %upcase(&distrib)=GOSSET %then %do;
        DF = N-1;
        t=tinv(_a_,DF);           /* t-Distribution          */
      %end;
      %if %upcase(&distrib)=NORMAL %then %do;
        DF = . ;
        t=probit(_a_);             /* Normal-Distribution     */
      %end;
      %if %upcase(&distrib)=CHI or
          %upcase(&distrib)=%STR(CHI SQUARE) %then %do;
        DF = 1;
        t=cinv(_a_,DF);             /* Chi-Square-Distribution */
      %end;
      %if %upcase(&distrib)=FISHER or
          %upcase(&distrib)=%STR(F) %then %do;
        DF = N-2;
        t=finv(_a_,1,DF);         /* Fisher-Distribution     */
      %end;
      ci_u = mean + t*std/sqrt(n); /* upper CI */
      ci_l = mean - t*std/sqrt(n); /* lower CI */
      put _a_ 9.3 t 12.6 ci_l 10.3 ci_u 10.3 n 6. DF 6. "    WORK.&ds_name";
      median = &median;
      ds_name = "&ds_name";
      call symput('min',min);
      call symput('max',max);
      output;
    end;
  run;

                                   /* add to the result     */
  proc append base=result data=_three force;
  run;

  %purge(_two);                    /* clean up library WORK */
  %purge(_three);

  %fin : ;

%mend interval;


%macro do_itALL(data,var,a,sampMin,sampMax,sampStep,distrib);
  /* DO_ITALL - creates one random sample per sample size from sampMin
                to sampMax by sampStep (via macro INTERVAL), collects
                the statistic values in data set RESULT and plots the
                confidence intervals with PROC GPLOT, annotated from
                data set _LABEL_.

     Parameters:
       data      input SAS data set; SHOWGRAPH or ZEIGEGRAFIK skips the
                 sampling and plots the existing data set RESULT
       var       analysis variable within DATA
       a         alpha in percent, passed on to INTERVAL
       sampMin   first sample size
       sampMax   upper limit of the sample size
       sampStep  increment of the sample size, must be ge 1
       distrib   name of the test distribution (checked below)

     Invalid parameters are reported with an Info: line in the log and
     the macro ends without producing a plot.                         */
  %local ok n ds_name n_mi n_ma min_x min_y max_x max_y dx dy;

  %macro date;                     /* look up actual date and time */
    %global DA TI;
    data _NULL_;
      x = put(Datepart(datetime()),worddatx.);
      y = put(TIME(),TIME8.);
      call symput('DA',x);
      call symput('TI',y);
    run;
  %mend;

  %macro datum;  /* write date and time, call with percent sign !! */
    &DA &TI
  %mend;

  %if %upcase(&data)=SHOWGRAPH or %UPCASE(&data)=ZEIGEGRAFIK
      %then %do;
         /* plot-only mode needs the RESULT data set of an earlier run */
         %if %sysfunc(exist(result))=0 %then %do;
            %put Info: Data set RESULT does not exist, nothing to plot;
            %goto fin;
         %end;
         proc sort data=result ( where=(n is not missing) );
              by n;
         run;
         /* derive min, max and step of the sample sizes from RESULT */
         data _NULL_;
            set result end=last;
            retain _min;
            if _N_=1 then do;
               _min=n;
               call symput('sampMin',n);
            end;
            if last  then do;
               /* keep the step at 1 or more: a step of 0 would give
                  BY 0 in the ORDER= list of the AXIS2 statement */
               call symput('sampStep',max(1,int((n-_min)/_N_)) );
               call symput('sampMax',n);
               call symput('var',var);
            end;
         run;
         %goto graph;
      %end;

                       /* check macro parameters, if these exist   */
  %if &data= or &var=  or  &a=  %then %do;
     %put Info: Please check macro parameters, some are missing;
     %goto fin;
  %end;

                        /* check sample size parameters            */
  %if &sampStep=  or &sampStep < 1 %then %do;
     %put Info: Stepsize &sampStep call parameter must be ge 1;
     %goto fin;
  %end;

  %if &sampMin=  or &sampMax=  or &sampMax < &sampMin %then %do;
     %put Info: no correct sample size min &sampMin or max &sampMax call parameter;
     %goto fin;
  %end;

  data _NULL_;          /* check parameter: test distribution      */
     check = upcase("&distrib");
     if check in ("CHI" "CHI SQUARE" "F" "FISHER" "T" "STUDENT" "GOSSET" "NORMAL")
        then ok=1;
        else ok=0;
     call symput('ok',ok);
  run;
  %if not &ok %then %do;
     %put Info: Test Distribution &distrib not supported or unknown;
     %goto fin;
  %end;

                        /* now, lets go                            */
  %purge(result);       /* clean up previous results, if any exist */

  data _NULL_;          /* log the header title from this run      */
    put " "; put " ";
    put " Creation of Samples from Data=%UPCASE(&data) with variable=%UPCASE(&var), Alpha=&a% ";
    put "              min  sample size=&sampMin";
    put "              max  sample size=&sampMax, step sample size=&sampStep";
    put "   test distribution=&distrib";
    put " ";
    put "  t-Quantil     t       lower CI  upper CI    N    DF   Data Set Name";
  run;

                        /* main loop for all samples with different sizes by &sampStep */
  %do n=&sampMin %to &sampMax %by &sampStep;
    /* sample data set name: Samp, sample size, first character of distrib */
    %let ds_name=Samp&n.%substr(&distrib,1,1);
    %interval(&data,&ds_name,&var,&n,&a,&distrib);
  %end;

  %graph : ;
                        /* no RESULT (every sample was empty): nothing to plot */
  %if %sysfunc(exist(result))=0 %then %do;
     %put Info: Data set RESULT does not exist, nothing to plot;
     %goto fin;
  %end;
                        /* utilities to prepare the PLOT */
  proc means data=result noprint;
       var n max;
       output out=_MINMAX_ min=min_x min_y max=max_x max_y;
  run;

  data _NULL_;          /* utility to find out MIN MAX area on PLOT */
    set _MINMAX_;
    dx = (max_x-min_x)*0.1;
    dy = (max_y-min_y)*0.1;
    call symput('min_x',min_x-dx); call symput('min_y',min_y-dy);
    call symput('max_x',max_x+dx); call symput('max_y',max_y+dy);
    call symput('dx',dx);          call symput('dy',dy);
  run;
  %purge(_MINMAX_);

  /* Annotate data set: per row of RESULT a line from min to max with
     a box between ci_l and ci_u (half width sampStep/8), a bar at the
     median, the data set name at min and the distribution text at max */
  data _label_;         /* utility to annotate the PLOT */
    set result end=last;
    length function $8. text $18. ;
    retain x 0 y 0 function ' ' xsys '2' ysys '2' style "zapf   " median
           size 2 position "3" text min  max ci_l ci_u n;
    keep function x y xsys ysys style size position text;
    text = ds_name;
    x=n;             y=min;        function="MOVE";  output;
                     y=ci_l;       function="DRAW";  output;
    x=n-&sampStep/8; y=ci_l;                         output;
                     y=ci_u;                         output;
    x=n;                                             output;
                     y=max;                          output;
                     y=ci_u;       function="MOVE";  output;
    x=n+&sampStep/8;               function="DRAW";  output;
                     y=ci_l;                         output;
    x=n;                                             output;
    x=n-&sampStep/8; y=median;     function="MOVE";  output;
    x=n+&sampStep/8;               function="DRAW";  output;
    x=n;             y=min;        function="LABEL"; style="simplex"; position="5"; size=0.56; output;
    x=n;             y=max;        function="LABEL"; style="simplex"; position="2"; text=distrib; output;
    size=2; style="zapf"; position="3";
  run;

                        /* SYMBOL statements for the PLOT */
  symbol1 i=join r=2 color=red    w=1 L=2;
  symbol2 i=none r=1 color=blue   w=2 v=plus  h=3;
  symbol3 i=none r=2 color=white;

                        /* range on X-AXIS (integer arithmetic of %EVAL) */
  %let n_mi=%eval(&sampMin-&sampStep/2);
  %let n_ma=%eval(&sampMax+&sampStep/2);

                        /* AXIS statements for the PLOT   */
  axis1 label=("&var");
  axis2 label=("Sample Size N") order=(&n_mi to &n_ma by &sampStep);

                        /* TITLE statements for the PLOT  */
  title1  h=0.45 cm  f=zapf "Sample Tests";
  title2  h=0.27 cm  f=simplex "test distributions and CI niveau are noted at the top";
  title3  h=0.27 cm  f=simplex "data set names are noted at the bottom";

                        /* FOOTNOTE statements for the PLOT */
  %date;
  footnote1 j=r h=0.25 cm f=simplex "produced: %datum ";
  footnote2 h=0.3 cm " ";

                        /* produce the PLOT */
  proc gplot data=result;
    plot ( ci_u ci_l mean min max ) * n / overlay vaxis=axis1
                                                  haxis=axis2
                                          annotate=_label_ ;
  run; quit;

  %goto next;
  %fin : ;
  %next : ;

  %put;                 /* put an empty line into log */

%mend do_itALL;         /* folks, that is all for now  */



                        /* needed OPTIONS and GOPTIONS statements */
/* Driver section: sets the session options, builds an example data
   set and runs the macros defined above on it. */
options nonotes nomprint nomlogic nosymbolgen nomacrogen ps=2000;

goptions reset=symbol colors=(black blue red green white)
                      htitle=6 htext=3 gunit=pct border;

* Example;

    /* 1700 observations of WEIGHT, each computed as 40 + 40*ranuni(0);
       the loop counter I is not kept */
    data eins;          /* create origin sample  */
         keep weight;
         do i=1 to 1700;
            weight = 40 + (80-40)*ranuni(0);
            output;
         end;
    run;

      *******************************************************;
      * macro parameters:                                    ;
      *                       sample size                    ;
      *   data,variable,alpha,min,max,step,test distribution ;
      *    |     |      |      |   |  |    |                 ;
%do_itALL(eins,weight  ,3    , 12,182,40  ,normal);
%do_itALL(eins,weight  ,3    , 15, 85,20  ,chi square);  * use >>not<< chi-square ;
%do_itALL(eins,weight  ,3    , 15, 85,20  ,student);
%do_itALL(eins,weight  ,3    , 15, 85,20  ,fisher);
      *                                                      ;
      *   data,result,variable,sample size,alpha,test distribution ;
      *     |    |       |       |           |     |         ;
    /* two separate INTERVAL calls append their rows to RESULT, then
       the SHOWGRAPH call plots RESULT without creating new samples */
%interval(eins,mdata99t,weight  ,99         ,4    ,t);
%interval(eins,mdata45n,weight  ,45         ,5    ,normal);
%do_itALL(showgraph);
      *                                                      ;
      *                                    End of Example    ;
      *******************************************************;
/* switch log notes and source echo back on */
options notes source;
*
;