/*------------------------------------------------------------------- */
 /*    Analyzing Receiver Operating Characteristic Curves with SAS    */
 /*                   by Mithat Gonen                                 */
 /*       Copyright(c) 2007 by SAS Institute Inc., Cary, NC, USA      */
 /*                                                                   */
 /*                        ISBN 978-1-59994-298-8                     */
 /*-------------------------------------------------------------------*/
 /*                                                                   */
 /* This material is provided "as is" by SAS Institute Inc.  There    */
 /* are no warranties, expressed or implied, as to merchantability or */
 /* fitness for a particular purpose regarding the materials or code  */
 /* contained herein. The Institute is not responsible for errors     */
 /* in this material as it now exists or will exist, nor does the     */
 /* Institute provide technical support for it.                       */
 /*                                                                   */
 /*-------------------------------------------------------------------*/
 /* Questions or problem reports concerning this material may be      */
 /* addressed to the author:                                          */
 /*                                                                   */
 /* SAS Institute Inc.                                                */
 /* SAS Press                                                         */
 /* Attn: Mithat Gonen                                                */
 /* SAS Campus Drive                                                  */
 /* Cary, NC   27513                                                  */
 /*                                                                   */
 /*                                                                   */
 /* If you prefer, you can send email to:  saspress@sas.com           */
 /* Use this for subject field:                                       */
 /*     Comments for Mithat Gonen                                     */
 /*                                                                   */
 /*-------------------------------------------------------------------*/


******************************************************************;
/*  ROCPLOT macro                                                */
/*  Author: Mithat Gonen                                         */
/*                                                               */
/*                                                               */
/*  Computes and plots the ROC curve                             */
/*  Contains enhancements to the OUTROC option in PROC LOGISTIC	 */
/*                                                               */
/*  INPUTS                                                       */
/*                                                               */
/*  dsn:    data set name                                        */
/*  marker: name of the marker or predictor variable for which   */
/*          an ROC curve is desired. must be numerical           */
/*          two variable names can be given separated by a blank */
/*          if an overlaid plot of two markers are desired       */
/*	gold:   variable for gold standard or true state of nature   */
/*          must be numerical and takes on values of 0 or 1      */
/*          only                                                 */
/*  anno:   controls the amount of annotation                    */
/*          0:  no graphical output, but prints the coordinates  */
/*              of the ROC curve along with the thresholds that  */
/*              corresponds to each of the (TPR,FPR) pairs.      */
/*              useful for determining the thresholds to be      */
/*              annotated (see anno=2 and anno=3 below)          */
/*          1:  plots only the ROC curve                         */
/*          2:  adds the 45-degree line                          */
/*          3:  marks the ROC curve at thresholds given by TLIST */	
/*              (see below)                                      */
/*          4:  adds the horizontal and vertical reference line  */
/*              for the thresholds                               */
/*  tlist:  a list of numerical values separated by blanks       */
/*          the curve will be marked at the points that          */
/*          corresponds to these thresholds. if an overlaid plot */
/*          is desired, this is taken to be the list for the     */
/*          first marker                                         */
/*  tlist2: list of mark points for second marker (see MARKER    */
/*          and TLIST). ignored unless an overlaid plot is       */
/*          desired                                              */
/*  round:  nearest rounding unit if the values in TLIST are to  */
/*          be rounded to match the observations in the data set */
/*          if an overlaid plot is desired this is taken to be   */
/*          the rounding unit for the first marker. Same format  */
/*          as in ROUND function                                 */
/*  round2: rounding unit for the second marker (see MARKER and  */
/*          ROUND). ignored unless an overlaid plot is desired   */ 
/*  optimal:if 1 a list of the distances from the perfect and    */
/*          non-informative markers are printed. often used to   */
/*          select an optimal threshold. ignored for overlaid    */
/*          plots.                                               */
/*                                                               */
******************************************************************;


%macro rocplot(dsn,marker,gold,anno=,tlist=,tlist2=,round=0,round2=0,optimal=0);


data _null_;
  _tmp="&marker";
  call symput("_multiple",countc(_tmp,' '));
run;

%if &_multiple=0 %then %do;

proc sort data=&dsn out=_plrc(rename=(&marker=_x_ &gold=_g) keep=&marker &gold);
  by &marker &gold;
run;

proc sql noprint;
  select sum(_g), n(_g) into :npos, :ntot from _plrc ;
quit;

data _plroc;
  set _plrc;
  retain tn fn 0;
  %if &round=0 %then _x=_x_;
  %else _x=round(_x_,&round);;
  if _g=0 then tn=tn+1;
  if _g=1 then fn=fn+1;
  tnr=tn/(&ntot-&npos);
  fnr=fn/&npos;
  fpr=1-tnr;
  tpr=1-fnr;
run;

data _dummy;
  _x=.;fpr=1;tpr=1;
run;

data _plroc_;
  set _plroc _dummy;
  by _x;
  if last._x;
  dist_to_perfect=sqrt(fpr**2+(1-tpr)**2);
  dist_to_noninf=abs(tpr-fpr);
  keep _x fpr tpr dist_to_perfect dist_to_noninf;
  label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate'
        dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker';
run;

%if &optimal=1 %then %do;
proc sort data=_plroc_;
  by dist_to_perfect dist_to_noninf;
run;
proc print data=_plroc_ noobs label;
  var _x fpr tpr dist_to_perfect dist_to_noninf;
run;
%end;
%if &anno=0 %then %do;
proc print data=_plroc_ noobs label;
  var _x fpr tpr dist_to_perfect dist_to_noninf;
run;
%end;

%else %do;

%annomac;

%if &anno=3 or &anno=4 %then %do;
data anno2;
  length function $ 5 text $ 60 style $ 8 color $ 32;
  set _plroc_(where=(_x in (&tlist)));
  xsys='2' ; ysys='2' ;
  function='label' ; y=tpr ; x=fpr;
  style='special' ; text='M' ; position='5' ; size=2 ; color='red'; output ;
  function='label' ; y=tpr ; x=fpr;
  style='swissb'; text=_x; position='1'; size=1.5; color='red'; output; 
  %line(0,0,1,1,black,3,2);
  %if &anno=4 %then %do;
    %line(fpr,0,fpr,tpr,black,3,1);
    %line(0,tpr,fpr,tpr,black,3,1);
  %end;
  %else;
run;
%end;
%else %if &anno=2 %then %do;
data anno2;
  xsys='2' ; ysys='2' ;
  %line(0,0,1,1,black,3,2);
run;
%end;

axis1 length=8cm order=0 to 1 by 0.2 label=(f=swissb h=2) value=(font=swissb h=2);
axis2 length=8cm order=0 to 1 by 0.2 label=(a=90 f=swissb h=2) value=(font=swissb h=2);

symbol1 v=none i=join w=3;

proc gplot data=_plroc_;
  plot tpr*fpr / haxis=axis1 vaxis=axis2 
		%if &anno^=1 %then annotate=anno2;;
run;quit;

%end;
%end;



%if &_multiple=1 %then %do;


%let marker1=%scan(&marker,1);
%let marker2=%scan(&marker,2);

proc sort data=&dsn out=_plrc1(rename=(&marker1=_x_ &gold=_g) keep=&marker1 &gold);
  by &marker1 &gold;
run;

proc sql noprint;
  select sum(_g), n(_g) into :npos, :ntot from _plrc1 ;
quit;

data _plroc1;
  set _plrc1;
  retain tn fn 0;
  %if &round=0 %then _x=_x_;
  %else _x=round(_x_,&round);;
  if _g=0 then tn=tn+1;
  if _g=1 then fn=fn+1;
  tnr=tn/(&ntot-&npos);
  fnr=fn/&npos;
  fpr=1-tnr;
  tpr=1-fnr;
run;

data _dummy;
  _x=.;fpr=1;tpr=1;
run;

data _plroc1_;
  set _plroc1 _dummy;
  by _x;
  if last._x;
  dist_to_perfect=sqrt(fpr**2+(1-tpr)**2);
  dist_to_noninf=abs(tpr-fpr);
  keep _x fpr tpr dist_to_perfect dist_to_noninf;
  label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate'
        dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker';
run;


proc sort data=&dsn out=_plrc2(rename=(&marker2=_x_ &gold=_g) keep=&marker2 &gold);
  by &marker2 &gold;
run;

proc sql noprint;
  select sum(_g), n(_g) into :npos, :ntot from _plrc2 ;
quit;

data _plroc2;
  set _plrc2;
  retain tn fn 0;
  %if &round2=0 %then _x=_x_;
  %else _x=round(_x_,&round2);;
  if _g=0 then tn=tn+1;
  if _g=1 then fn=fn+1;
  tnr=tn/(&ntot-&npos);
  fnr=fn/&npos;
  fpr=1-tnr;
  tpr=1-fnr;
run;

data _dummy;
  _x=.;fpr=1;tpr=1;
run;

data _plroc2_;
  set _plroc2 _dummy;
  by _x;
  if last._x;
  dist_to_perfect=sqrt(fpr**2+(1-tpr)**2);
  dist_to_noninf=abs(tpr-fpr);
  keep _x fpr tpr dist_to_perfect dist_to_noninf;
  label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate'
        dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker';
run;

data _plroc_;
  set _plroc1_(in=in1) _plroc2_(in=in2);
  if in1 then _marker="&marker1";
  else if in2 then _marker="&marker2";
run;


%if &anno=0 %then %do;
proc print data=_plroc_ noobs label;
  var _x fpr tpr dist_to_perfect dist_to_noninf;
run;
%end;

%else %do;

%annomac;

%if &anno=3 or &anno=4 %then %do;
data anno2;
  length function $ 5 text $ 60 style $ 8 color $ 32;
  set _plroc_(where=((_x in (&tlist) and _marker="&marker1") or (_x in (&tlist2) and _marker="&marker2")));
  xsys='2' ; ysys='2' ;
  function='label' ; y=tpr ; x=fpr;
  style='special' ; text='M' ; position='5' ; size=2 ; 
  if _marker="&marker1" then color='red'; else if _marker="&marker2" then color='blue'; output ;
  function='label' ; y=tpr ; x=fpr;
  style='swissb'; text=_x; position='1'; size=1.5; 
  if _marker="&marker1" then color='red'; else if _marker="&marker2" then color='blue'; output; 
  %line(0,0,1,1,black,3,2);
  %if &anno=4 %then %do;
    %line(fpr,0,fpr,tpr,black,3,1);
    %line(0,tpr,fpr,tpr,black,3,1);
  %end;
  %else;
run;
%end;
%else %if &anno=2 %then %do;
data anno2;
  xsys='2' ; ysys='2' ;
  %line(0,0,1,1,black,3,2);
run;
%end;

axis1 length=8cm order=0 to 1 by 0.2 label=(f=swissb h=2) value=(font=swissb h=2);
axis2 length=8cm order=0 to 1 by 0.2 label=(a=90 f=swissb h=2) value=(font=swissb h=2);

symbol1 v=none c=black i=join w=3 l=1;
symbol2 v=none c=black i=join w=3 l=3;

legend1 position=(inside right bottom) label=none value=(font=swissb) frame across=1;

proc gplot data=_plroc_;
  plot tpr*fpr=_marker / haxis=axis1 vaxis=axis2 legend=legend1
		%if &anno^=1 %then annotate=anno2;;
run;quit;

%end;


%end;


%mend;



******************************************************************;
/*  BOOT1UAC macro                                               */
/*  Author: Mithat Gonen                                         */
/*                                                               */
/*                                                               */
/*  Computes the bootstrap distribution for the AUC              */
/*  and reports the point estimate, standard error and           */
/*  the 95% confidence interval                                  */
/*                                                               */
/*  The distribution itself is available in the data set         */
/*  BOOTDIST                                                     */
/*                                                               */
/*  Missing Data: It is safer to remove the records with         */
/*                missing values of either &marker or &gold      */
/*                to avoid potential imbalances during           */
/*                resampling                                     */
/*                                                               */
/*                                                               */
/*                                                               */
/*  INPUTS                                                       */
/*                                                               */
/*  dsn:    data set name                                        */
/*  marker: name of the marker or predictor variable for which   */
/*          an ROC curve is desired. must be numerical           */
/*  gold:   variable for gold standard or true state of nature   */
/*          must be numerical and takes on values of 0 or 1      */
/*          only                                                 */
/*  boot:   number of bootstrap samples. 1000 by default         */
/*  alpha:  Error level for the confidence interval              */
/*                                                               */
******************************************************************;


%macro boot1auc(dsn,marker,gold,boot=1000,alpha=0.05);

%let conflev=%sysevalf(100*(1-&alpha));
%let ll=%sysevalf((&alpha/2)*&boot);
%let ul=%sysevalf((1-(&alpha)/2)*&boot);

proc datasets;delete bootsample bootdist;

proc sql noprint;
  select n(&marker) into :n
  from &dsn;
quit;

proc surveyselect data=&dsn method=urs n=&n out=bootsample outhits rep=&boot noprint;
run;

filename junk 'junk.txt';
proc printto print=junk;run;

proc logistic data=bootsample;
  by replicate;
  model &gold=&marker;
  ods output association=assoc;
run;

proc printto;run;
data bootdist(keep=nvalue2 rename=(nvalue2=auc));
  set assoc(where=(label2='c'));
run;
options formdlim=' ';
title "Bootstrap Analysis of the AUC for &marker";

proc sql;
  select mean(auc) as AUC, std(auc) as StdErr
  from bootdist;
run;

proc sort data=bootdist;by auc;run;
title "&conflev.% Confidence Interval";

proc sql;
  select a.auc as LowerLimit, b.auc as UpperLimit
  from bootdist(firstobs=&ll obs=&ll) a, bootdist(firstobs=&ul obs=&ul) b; 
quit;
options formdlim='';
%mend;


******************************************************************;
/*  BOOT2UAC macro                                               */
/*  Author: Mithat Gonen                                         */
/*                                                               */
/*                                                               */
/*  Computes the bootstrap distribution for the AUC              */
/*  and reports the point estimate, standard error and           */
/*  the 95% confidence interval                                  */
/*                                                               */
/*  The distribution itself is available in the data set         */
/*  BOOTDIST                                                     */
/*                                                               */
/*  Missing Data: It is safer to remove the records with         */
/*                missing values of either &marker1 or &marker2  */ 
/*                or &gold to avoid potential imbalances during  */
/*                resampling                                     */
/*                                                               */
/*                                                               */
/*                                                               */
/*                                                               */
/*  INPUTS                                                       */
/*                                                               */
/*  dsn:    data set name                                        */
/*  marker1,marker2:                                             */  
/*          names of the markers or predictor variables for which*/
/*          ROC curves are desired. must be numerical            */
/*	gold:   variable for gold standard or true state of nature   */
/*          must be numerical and takes on values of 0 or 1      */
/*          only                                                 */
/*  boot:   number of bootstrap samples. 1000 by default         */
/*  alpha:  Error level for the confidence interval              */
/*  null:   The null value for difference AUCs                   */
/*                                                               */
******************************************************************;


%macro boot2auc(dsn,marker1,marker2,gold,boot=1000,alpha=0.05,null=0);
options formdlim=' ';
%let conflev=%sysevalf(100*(1-&alpha));
%let ll=%sysevalf((&alpha/2)*&boot);
%let ul=%sysevalf((1-(&alpha)/2)*&boot);

proc datasets;delete bootsample bootdist;

proc sql noprint;
  select n(&gold) into :n
  from &dsn;
quit;

proc surveyselect data=&dsn method=urs n=&n out=bootsample outhits rep=&boot noprint;
run;

filename junk 'junk.txt';
proc printto print=junk;run;

proc logistic data=bootsample;
  by replicate;
  model &gold=&marker1;
  ods output association=assoc1;
run;

proc logistic data=bootsample;
  by replicate;
  model &gold=&marker2;
  ods output association=assoc2;
run;

proc printto;run;

data assoc;
  merge assoc1(where=(label2='c') rename=(nvalue2=auc1)) assoc2(where=(label2='c') rename=(nvalue2=auc2));
  by replicate;
  diff=auc1-auc2;
run;

proc sort data=assoc;by diff;run;

proc sql;
  select mean(auc1) as AUC1, std(auc2) as SE, mean(auc2) as AUC2, std(auc2) as SE, 
  mean(diff) as Difference, std(diff) as SE from assoc;
quit;
title "&conflev.% Confidence Interval for the Difference";
proc sql;
  select a.diff as LowerLimit, b.diff as UpperLimit
  from assoc(firstobs=&ll obs=&ll) a, assoc(firstobs=&ul obs=&ul) b; 
quit;
title "H0:Difference=&null";
proc sql;
  select 2*min(n(diff),&boot-n(diff))/&boot as p from assoc where diff>&null;
quit;
%mend;


******************************************************************;
/*  XVAL macro                                                   */
/*  Author: Mithat Gonen                                         */
/*                                                               */
/*                                                               */
/*  Generates a data set that can be used for k-fold             */
/*  cross-validation. The only output is a data set that         */
/*  be used as input to %ROC and %ROCPLOT                        */
/*                                                               */
/*                                                               */
/*  INPUTS                                                       */
/*                                                               */
/*  dsn:    data set name                                        */
/*  outcome:independent variable                                 */
/*  covars: list of dependent variables separated by blanks      */
/*  k:      Fold level of cross validation                       */
/*  sel:    Selection method for logistic regression             */
/*  outdsn: Output data set that contains cross-validated        */
/*          predicted probabilities. Default is _XVAL_           */
/*                                                               */
******************************************************************;


%macro xval(dsn=,outcome=,covars=,k=10,sel=stepwise,outdsn=_xval_);

data _modif;
  set &dsn;
  unif=&k*ranuni(20052905);
  xv=ceil(unif);
run;
%do i=1 %to &k;
  proc logistic data=_modif(where=(xv ne &i)) outmodel=_mod&i noprint;
    model &outcome=&covars / selection=&sel;
  run;
%if print^=0 %then %do;proc printto file='junk.txt';%end;
  proc logistic inmodel=_mod&i;
    score data=_modif(where=(xv=&i)) out=out&i;
  run;
%if print^=0 %then %do;proc printto;run;%end;
%end;
  data &outdsn;
    set %do j=1 %to &k;out&j %end;;
  run;

%mend;



******************************************************************;
/*  BVAL macro                                                   */
/*  Author: Mithat Gonen                                         */
/*                                                               */
/*                                                               */
/*  Performs bootstrap validation                                */
/*                                                               */
/*  INPUTS                                                       */
/*                                                               */
/*  dsn:    data set name                                        */
/*  outcome:independent variable                                 */
/*  covars: list of dependent variables separated by blanks      */
/*  B:      Number of bootstrap samples                          */
/*  sel:    Selection method for logistic regression             */
/*                                                               */
******************************************************************;

%macro bval(dsn=,outcome=,covars=,B=10,sel=stepwise);
proc sql noprint;
  select n(&outcome) into:_n from &dsn;
run;
proc surveyselect data=liver method=urs outhits rep=&B n=&_n out=bsamples noprint;
run;
%do i=1 %to &B;
  proc logistic data=bsamples(where=(replicate=&i)) outmodel=_mod&i noprint;
    model &outcome=&covars / selection=&sel;
  run;
  proc printto file='junk.txt';
  proc logistic inmodel=_mod&i;
    score data=&dsn out=out1&i;
  run;
  proc logistic inmodel=_mod&i;
    score data=bsamples(where=(replicate=&i)) out=out2&i;
  run;
  proc printto;run;  
%end;
  data bval1;
    set %do j=1 %to &B;out1&j(in=in&j) %end;;
	%do j=1 %to &B; if in&j then bsamp=&j; %end;
  run;
  data bval2;
    set %do j=1 %to &B;out2&j(in=in&j) %end;;
	%do j=1 %to &B; if in&j then bsamp=&j; %end;
  run;


proc printto file='junk.txt' new;
proc logistic data=bval1;
  by bsamp;
  model &outcome=p_1;
  ods output association=assoc1;
run;

proc logistic data=bval2;
  by bsamp;
  model &outcome=p_1;
  ods output association=assoc2;
run;
proc logistic data=&dsn;
  model &outcome=&covars / selection=&sel;
  ods output association=assoc3;
run;
proc printto;

data assoc3;
  set assoc3;
  bsamp=1;
run;

data optim;
  merge assoc1(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc1))
        assoc2(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc2))
        assoc3(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc3));
  by bsamp;
run;

proc sql;
  select mean(auc3) as OptimisticAUC, mean(auc2-auc1) as OptimisimCorrection, 
		 mean(auc3)-mean(auc2-auc1) as CorrectedAUC from optim;
quit;


%mend;