/*------------------------------------------------------------------- */ /* Analyzing Receiver Operating Characteristic Curves with SAS */ /* by Mithat Gonen */ /* Copyright(c) 2007 by SAS Institute Inc., Cary, NC, USA */ /* */ /* ISBN 978-1-59994-298-8 */ /*-------------------------------------------------------------------*/ /* */ /* This material is provided "as is" by SAS Institute Inc. There */ /* are no warranties, expressed or implied, as to merchantability or */ /* fitness for a particular purpose regarding the materials or code */ /* contained herein. The Institute is not responsible for errors */ /* in this material as it now exists or will exist, nor does the */ /* Institute provide technical support for it. */ /* */ /*-------------------------------------------------------------------*/ /* Questions or problem reports concerning this material may be */ /* addressed to the author: */ /* */ /* SAS Institute Inc. */ /* SAS Press */ /* Attn: Mithat Gonen */ /* SAS Campus Drive */ /* Cary, NC 27513 */ /* */ /* */ /* If you prefer, you can send email to: saspress@sas.com */ /* Use this for subject field: */ /* Comments for Mithat Gonen */ /* */ /*-------------------------------------------------------------------*/ ******************************************************************; /* ROCPLOT macro */ /* Author: Mithat Gonen */ /* */ /* */ /* Computes and plots the ROC curve */ /* Contains enhancements to the OUTROC option in PROC LOGISTIC */ /* */ /* INPUTS */ /* */ /* dsn: data set name */ /* marker: name of the marker or predictor variable for which */ /* an ROC curve is desired. must be numerical */ /* two variable names can be given separated by a blank */ /* if an overlaid plot of two markers are desired */ /* gold: variable for gold standard or true state of nature */ /* must be numerical and takes on values of 0 or 1 */ /* only */ /* anno: controls the amount of annotation */ /* 0: no graphical output, but prints the coordinates */ /* of the ROC curve along with the thresholds that */ /* corresponds to each of the (TPR,FPR) pairs. */ /* useful for determining the thresholds to be */ /* annotated (see anno=2 and anno=3 below) */ /* 1: plots only the ROC curve */ /* 2: adds the 45-degree line */ /* 3: marks the ROC curve at thresholds given by TLIST */ /* (see below) */ /* 4: adds the horizontal and vertical reference line */ /* for the thresholds */ /* tlist: a list of numerical values separated by blanks */ /* the curve will be marked at the points that */ /* corresponds to these thresholds. if an overlaid plot */ /* is desired, this is taken to be the list for the */ /* first marker */ /* tlist2: list of mark points for second marker (see MARKER */ /* and TLIST). ignored unless an overlaid plot is */ /* desired */ /* round: nearest rounding unit if the values in TLIST are to */ /* be rounded to match the observations in the data set */ /* if an overlaid plot is desired this is taken to be */ /* the rounding unit for the first marker. Same format */ /* as in ROUND function */ /* round2: rounding unit for the second marker (see MARKER and */ /* ROUND). ignored unless an overlaid plot is desired */ /* optimal:if 1 a list of the distances from the perfect and */ /* non-informative markers are printed. often used to */ /* select an optimal threshold. ignored for overlaid */ /* plots. */ /* */ ******************************************************************; %macro rocplot(dsn,marker,gold,anno=,tlist=,tlist2=,round=0,round2=0,optimal=0); data _null_; _tmp="&marker"; call symput("_multiple",countc(_tmp,' ')); run; %if &_multiple=0 %then %do; proc sort data=&dsn out=_plrc(rename=(&marker=_x_ &gold=_g) keep=&marker &gold); by &marker &gold; run; proc sql noprint; select sum(_g), n(_g) into :npos, :ntot from _plrc ; quit; data _plroc; set _plrc; retain tn fn 0; %if &round=0 %then _x=_x_; %else _x=round(_x_,&round);; if _g=0 then tn=tn+1; if _g=1 then fn=fn+1; tnr=tn/(&ntot-&npos); fnr=fn/&npos; fpr=1-tnr; tpr=1-fnr; run; data _dummy; _x=.;fpr=1;tpr=1; run; data _plroc_; set _plroc _dummy; by _x; if last._x; dist_to_perfect=sqrt(fpr**2+(1-tpr)**2); dist_to_noninf=abs(tpr-fpr); keep _x fpr tpr dist_to_perfect dist_to_noninf; label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate' dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker'; run; %if &optimal=1 %then %do; proc sort data=_plroc_; by dist_to_perfect dist_to_noninf; run; proc print data=_plroc_ noobs label; var _x fpr tpr dist_to_perfect dist_to_noninf; run; %end; %if &anno=0 %then %do; proc print data=_plroc_ noobs label; var _x fpr tpr dist_to_perfect dist_to_noninf; run; %end; %else %do; %annomac; %if &anno=3 or &anno=4 %then %do; data anno2; length function $ 5 text $ 60 style $ 8 color $ 32; set _plroc_(where=(_x in (&tlist))); xsys='2' ; ysys='2' ; function='label' ; y=tpr ; x=fpr; style='special' ; text='M' ; position='5' ; size=2 ; color='red'; output ; function='label' ; y=tpr ; x=fpr; style='swissb'; text=_x; position='1'; size=1.5; color='red'; output; %line(0,0,1,1,black,3,2); %if &anno=4 %then %do; %line(fpr,0,fpr,tpr,black,3,1); %line(0,tpr,fpr,tpr,black,3,1); %end; %else; run; %end; %else %if &anno=2 %then %do; data anno2; xsys='2' ; ysys='2' ; %line(0,0,1,1,black,3,2); run; %end; axis1 length=8cm order=0 to 1 by 0.2 label=(f=swissb h=2) value=(font=swissb h=2); axis2 length=8cm order=0 to 1 by 0.2 label=(a=90 f=swissb h=2) value=(font=swissb h=2); symbol1 v=none i=join w=3; proc gplot data=_plroc_; plot tpr*fpr / haxis=axis1 vaxis=axis2 %if &anno^=1 %then annotate=anno2;; run;quit; %end; %end; %if &_multiple=1 %then %do; %let marker1=%scan(&marker,1); %let marker2=%scan(&marker,2); proc sort data=&dsn out=_plrc1(rename=(&marker1=_x_ &gold=_g) keep=&marker1 &gold); by &marker1 &gold; run; proc sql noprint; select sum(_g), n(_g) into :npos, :ntot from _plrc1 ; quit; data _plroc1; set _plrc1; retain tn fn 0; %if &round=0 %then _x=_x_; %else _x=round(_x_,&round);; if _g=0 then tn=tn+1; if _g=1 then fn=fn+1; tnr=tn/(&ntot-&npos); fnr=fn/&npos; fpr=1-tnr; tpr=1-fnr; run; data _dummy; _x=.;fpr=1;tpr=1; run; data _plroc1_; set _plroc1 _dummy; by _x; if last._x; dist_to_perfect=sqrt(fpr**2+(1-tpr)**2); dist_to_noninf=abs(tpr-fpr); keep _x fpr tpr dist_to_perfect dist_to_noninf; label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate' dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker'; run; proc sort data=&dsn out=_plrc2(rename=(&marker2=_x_ &gold=_g) keep=&marker2 &gold); by &marker2 &gold; run; proc sql noprint; select sum(_g), n(_g) into :npos, :ntot from _plrc2 ; quit; data _plroc2; set _plrc2; retain tn fn 0; %if &round2=0 %then _x=_x_; %else _x=round(_x_,&round2);; if _g=0 then tn=tn+1; if _g=1 then fn=fn+1; tnr=tn/(&ntot-&npos); fnr=fn/&npos; fpr=1-tnr; tpr=1-fnr; run; data _dummy; _x=.;fpr=1;tpr=1; run; data _plroc2_; set _plroc2 _dummy; by _x; if last._x; dist_to_perfect=sqrt(fpr**2+(1-tpr)**2); dist_to_noninf=abs(tpr-fpr); keep _x fpr tpr dist_to_perfect dist_to_noninf; label _x='Threshold' fpr='False Positive Rate' tpr='True Positive Rate' dist_to_perfect='Distance from Perfect Marker' dist_to_noninf='Distance from Non-Informative Marker'; run; data _plroc_; set _plroc1_(in=in1) _plroc2_(in=in2); if in1 then _marker="&marker1"; else if in2 then _marker="&marker2"; run; %if &anno=0 %then %do; proc print data=_plroc_ noobs label; var _x fpr tpr dist_to_perfect dist_to_noninf; run; %end; %else %do; %annomac; %if &anno=3 or &anno=4 %then %do; data anno2; length function $ 5 text $ 60 style $ 8 color $ 32; set _plroc_(where=((_x in (&tlist) and _marker="&marker1") or (_x in (&tlist2) and _marker="&marker2"))); xsys='2' ; ysys='2' ; function='label' ; y=tpr ; x=fpr; style='special' ; text='M' ; position='5' ; size=2 ; if _marker="&marker1" then color='red'; else if _marker="&marker2" then color='blue'; output ; function='label' ; y=tpr ; x=fpr; style='swissb'; text=_x; position='1'; size=1.5; if _marker="&marker1" then color='red'; else if _marker="&marker2" then color='blue'; output; %line(0,0,1,1,black,3,2); %if &anno=4 %then %do; %line(fpr,0,fpr,tpr,black,3,1); %line(0,tpr,fpr,tpr,black,3,1); %end; %else; run; %end; %else %if &anno=2 %then %do; data anno2; xsys='2' ; ysys='2' ; %line(0,0,1,1,black,3,2); run; %end; axis1 length=8cm order=0 to 1 by 0.2 label=(f=swissb h=2) value=(font=swissb h=2); axis2 length=8cm order=0 to 1 by 0.2 label=(a=90 f=swissb h=2) value=(font=swissb h=2); symbol1 v=none c=black i=join w=3 l=1; symbol2 v=none c=black i=join w=3 l=3; legend1 position=(inside right bottom) label=none value=(font=swissb) frame across=1; proc gplot data=_plroc_; plot tpr*fpr=_marker / haxis=axis1 vaxis=axis2 legend=legend1 %if &anno^=1 %then annotate=anno2;; run;quit; %end; %end; %mend; ******************************************************************; /* BOOT1UAC macro */ /* Author: Mithat Gonen */ /* */ /* */ /* Computes the bootstrap distribution for the AUC */ /* and reports the point estimate, standard error and */ /* the 95% confidence interval */ /* */ /* The distribution itself is available in the data set */ /* BOOTDIST */ /* */ /* Missing Data: It is safer to remove the records with */ /* missing values of either &marker or &gold */ /* to avoid potential imbalances during */ /* resampling */ /* */ /* */ /* */ /* INPUTS */ /* */ /* dsn: data set name */ /* marker: name of the marker or predictor variable for which */ /* an ROC curve is desired. must be numerical */ /* gold: variable for gold standard or true state of nature */ /* must be numerical and takes on values of 0 or 1 */ /* only */ /* boot: number of bootstrap samples. 1000 by default */ /* alpha: Error level for the confidence interval */ /* */ ******************************************************************; %macro boot1auc(dsn,marker,gold,boot=1000,alpha=0.05); %let conflev=%sysevalf(100*(1-&alpha)); %let ll=%sysevalf((&alpha/2)*&boot); %let ul=%sysevalf((1-(&alpha)/2)*&boot); proc datasets;delete bootsample bootdist; proc sql noprint; select n(&marker) into :n from &dsn; quit; proc surveyselect data=&dsn method=urs n=&n out=bootsample outhits rep=&boot noprint; run; filename junk 'junk.txt'; proc printto print=junk;run; proc logistic data=bootsample; by replicate; model &gold=▮ ods output association=assoc; run; proc printto;run; data bootdist(keep=nvalue2 rename=(nvalue2=auc)); set assoc(where=(label2='c')); run; options formdlim=' '; title "Bootstrap Analysis of the AUC for &marker"; proc sql; select mean(auc) as AUC, std(auc) as StdErr from bootdist; run; proc sort data=bootdist;by auc;run; title "&conflev.% Confidence Interval"; proc sql; select a.auc as LowerLimit, b.auc as UpperLimit from bootdist(firstobs=&ll obs=&ll) a, bootdist(firstobs=&ul obs=&ul) b; quit; options formdlim=''; %mend; ******************************************************************; /* BOOT2UAC macro */ /* Author: Mithat Gonen */ /* */ /* */ /* Computes the bootstrap distribution for the AUC */ /* and reports the point estimate, standard error and */ /* the 95% confidence interval */ /* */ /* The distribution itself is available in the data set */ /* BOOTDIST */ /* */ /* Missing Data: It is safer to remove the records with */ /* missing values of either &marker1 or &marker2 */ /* or &gold to avoid potential imbalances during */ /* resampling */ /* */ /* */ /* */ /* */ /* INPUTS */ /* */ /* dsn: data set name */ /* marker1,marker2: */ /* names of the markers or predictor variables for which*/ /* ROC curves are desired. must be numerical */ /* gold: variable for gold standard or true state of nature */ /* must be numerical and takes on values of 0 or 1 */ /* only */ /* boot: number of bootstrap samples. 1000 by default */ /* alpha: Error level for the confidence interval */ /* null: The null value for difference AUCs */ /* */ ******************************************************************; %macro boot2auc(dsn,marker1,marker2,gold,boot=1000,alpha=0.05,null=0); options formdlim=' '; %let conflev=%sysevalf(100*(1-&alpha)); %let ll=%sysevalf((&alpha/2)*&boot); %let ul=%sysevalf((1-(&alpha)/2)*&boot); proc datasets;delete bootsample bootdist; proc sql noprint; select n(&gold) into :n from &dsn; quit; proc surveyselect data=&dsn method=urs n=&n out=bootsample outhits rep=&boot noprint; run; filename junk 'junk.txt'; proc printto print=junk;run; proc logistic data=bootsample; by replicate; model &gold=&marker1; ods output association=assoc1; run; proc logistic data=bootsample; by replicate; model &gold=&marker2; ods output association=assoc2; run; proc printto;run; data assoc; merge assoc1(where=(label2='c') rename=(nvalue2=auc1)) assoc2(where=(label2='c') rename=(nvalue2=auc2)); by replicate; diff=auc1-auc2; run; proc sort data=assoc;by diff;run; proc sql; select mean(auc1) as AUC1, std(auc2) as SE, mean(auc2) as AUC2, std(auc2) as SE, mean(diff) as Difference, std(diff) as SE from assoc; quit; title "&conflev.% Confidence Interval for the Difference"; proc sql; select a.diff as LowerLimit, b.diff as UpperLimit from assoc(firstobs=&ll obs=&ll) a, assoc(firstobs=&ul obs=&ul) b; quit; title "H0:Difference=&null"; proc sql; select 2*min(n(diff),&boot-n(diff))/&boot as p from assoc where diff>&null; quit; %mend; ******************************************************************; /* XVAL macro */ /* Author: Mithat Gonen */ /* */ /* */ /* Generates a data set that can be used for k-fold */ /* cross-validation. The only output is a data set that */ /* be used as input to %ROC and %ROCPLOT */ /* */ /* */ /* INPUTS */ /* */ /* dsn: data set name */ /* outcome:independent variable */ /* covars: list of dependent variables separated by blanks */ /* k: Fold level of cross validation */ /* sel: Selection method for logistic regression */ /* outdsn: Output data set that contains cross-validated */ /* predicted probabilities. Default is _XVAL_ */ /* */ ******************************************************************; %macro xval(dsn=,outcome=,covars=,k=10,sel=stepwise,outdsn=_xval_); data _modif; set &dsn; unif=&k*ranuni(20052905); xv=ceil(unif); run; %do i=1 %to &k; proc logistic data=_modif(where=(xv ne &i)) outmodel=_mod&i noprint; model &outcome=&covars / selection=&sel; run; %if print^=0 %then %do;proc printto file='junk.txt';%end; proc logistic inmodel=_mod&i; score data=_modif(where=(xv=&i)) out=out&i; run; %if print^=0 %then %do;proc printto;run;%end; %end; data &outdsn; set %do j=1 %to &k;out&j %end;; run; %mend; ******************************************************************; /* BVAL macro */ /* Author: Mithat Gonen */ /* */ /* */ /* Performs bootstrap validation */ /* */ /* INPUTS */ /* */ /* dsn: data set name */ /* outcome:independent variable */ /* covars: list of dependent variables separated by blanks */ /* B: Number of bootstrap samples */ /* sel: Selection method for logistic regression */ /* */ ******************************************************************; %macro bval(dsn=,outcome=,covars=,B=10,sel=stepwise); proc sql noprint; select n(&outcome) into:_n from &dsn; run; proc surveyselect data=liver method=urs outhits rep=&B n=&_n out=bsamples noprint; run; %do i=1 %to &B; proc logistic data=bsamples(where=(replicate=&i)) outmodel=_mod&i noprint; model &outcome=&covars / selection=&sel; run; proc printto file='junk.txt'; proc logistic inmodel=_mod&i; score data=&dsn out=out1&i; run; proc logistic inmodel=_mod&i; score data=bsamples(where=(replicate=&i)) out=out2&i; run; proc printto;run; %end; data bval1; set %do j=1 %to &B;out1&j(in=in&j) %end;; %do j=1 %to &B; if in&j then bsamp=&j; %end; run; data bval2; set %do j=1 %to &B;out2&j(in=in&j) %end;; %do j=1 %to &B; if in&j then bsamp=&j; %end; run; proc printto file='junk.txt' new; proc logistic data=bval1; by bsamp; model &outcome=p_1; ods output association=assoc1; run; proc logistic data=bval2; by bsamp; model &outcome=p_1; ods output association=assoc2; run; proc logistic data=&dsn; model &outcome=&covars / selection=&sel; ods output association=assoc3; run; proc printto; data assoc3; set assoc3; bsamp=1; run; data optim; merge assoc1(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc1)) assoc2(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc2)) assoc3(where=(label2='c') keep=bsamp label2 nvalue2 rename=(nvalue2=auc3)); by bsamp; run; proc sql; select mean(auc3) as OptimisticAUC, mean(auc2-auc1) as OptimisimCorrection, mean(auc3)-mean(auc2-auc1) as CorrectedAUC from optim; quit; %mend;