/*******************************************************************
%qms2 0.11

Copyright (c) 2000 & 2001 Jeff Lessem, Institute for Behavioral Genetics,
    University of Colorado; and Stacey Cherny, Wellcome Trust Centre
    for Human Genetics, University of Oxford

See http://ibgwww.Colorado.EDU/~lessem/software/qms2.html#license for
the terms under which this software may be used.

Perform a DeFries-Fulker or Haseman-Elston regression on the data
imported by %ghimport, or a compatible script.

Usage:

    data= The dataset to read records for analysis from.  The dataset
          produced by %ghimport meets these requirements, but any user
          supplied dataset in the same arrangement would also work.
          The dataset should have the following variables:
            POS= The position on the chromosome in cM
            Pedigree= A unique number used to identify a family
            Indnum1= A unique number used to identify the first
                   sibling from a family in a pairing
            Indnum2= A unique number used to identify the second
                   sibling from a family in a pairing
            IBD0= The probability of indnum1 and indnum2 being IBD 0
                  at POS
            IBD1= The probability of indnum1 and indnum2 being IBD 1
                  at POS
            IBD2= The probability of indnum1 and indnum2 being IBD 2
                  at POS
            SEX1= Sex of indnum1 (typically 1 or 2, but this macro makes
                no assumption about the meaning of the values)
            SEX2= Sex of indnum2
            Proband1= Proband (or affection) status of indnum1, 1 is
                    not affected and 2 is affected (proband)
            Proband2= Proband (or affection) status of indnum2
            Phen1= The phenotypic score of indnum1
            Phen2= The phenotypic score of indnum1
            DF= The number of unique observations provided by this
              record (either 1 if this is the first appearance of this
              sib pairing, or 0 if this is a subsequent, double entered,
              appearance of the pairing)
            DBL= Designates the current record as being the first
               pairing of the two individuals (0) or the second,
               double-entered pairing (1)
            Pihat= .5 * ibd1 + ibd2
          OPTIONAL, DEFAULT=ibdphen

    output= The dataset to save the output statistics to.  This dataset
            contains the following variables:
            POS= position on the chromosome in cM
            N= The total number of records used in the analyses
              (includes double-entered pairings)
            Real_N= N after correcting for double entry
            ModelDF= Model degrees of freedom, which is the number of
              independent variables.
            Intercep= Mean of the phenotype for ibd 0 (DF model)
              Mean of sibpair difference for ibd 0 (HE model)
            Phen1= The regression beta for the phenotype (DF model)
            b_Aug= The regression beta for the interaction term (DF
              augmented model B5)
            Shapiro= The Shapiro-Wilk W statistic, a measure of
              normality, for the residuals from the regression model
            ShapiroP= The probability that the residuals depart from
              normality
            T= t score, adjusted for double entry
            P= p value, adjusted for double entry
            RMSE= root mean squared error
            B_Pihat= Beta coefficient of pihat (DF model B2 or the HE
                regression coefficient)
            Model= Which model used, either Haseman-Elston,
              DeFries-Fulker, or DF Augmented
            OPTIONAL, DEFAULT=stats

    infce= Output statistics detailing how much influence each
           family has on the regression, 1=yes, 0=no
           OPTIONAL, DEFAULT=0 (no)
    
    infout= The dataset to save the influence statistics to.  A
        complete explanation of the influence statistics can be found
        in the PROC REG documentation for SAS Software.  The following
        variables are saved:  OPTIONAL, DEFAULT=infce
            POS= position on the chromosome in cM
            PEDIGREE= Pedigree number
            INDNUM1= ID number of the proband
            INDNUM2= ID number of the co-sib
            IBD0= Probability that the pair shares no alleles IBD at
                the locus
            IBD1= Probability that the pair shares one allele IBD at
                the locus
            IBD2= Probability that the pair shares both alleles IBD at
                the locus
            SEX1= Sex of the proband
            PROBAND1= 2 indicates individual number 1 is a proband,
                this should always be the case in a selected sample
            PHEN1= Phenotypic score of the proband
            SEX2= Sex of the co-sib
            PROBAND2= 2 indicates that the co-sib is also a proband, a
                1 indicates the co-sib is not a proband
            PHEN2= Phenotypic score of the co-sib
            DF= The number of unique statistics provided by this entry
                (1 or 0)
            DBL= A 1 indicates this is a double entered pairing
            PIHAT= The pihat of the pairing
            AUGMENT= The interaction term of the pair in the DF
                augmented model
            H= Belsley, Kuh, and Welsch (1980) hat matrix statistic
            RSTUDENT= The studentized residual of the observation.
                 Values with an absolute value greater than 2 may
                 warrant investigation.
            DFFITS= A scaled measure of the change in the predicted
                 value for the observation.  Values of DFFITS over 2
                 may warrant investigation.
            MODEL= Which model the results came from
    
    plot= Produce a plot of the results, 1=yes, 0=no
          OPTIONAL, DEFAULT=0 (no)

    vars= Variables to plot, in the form "Y-Axis*X-Axis".  Useful
         examples are "t*pos" or "p*pos"
         OPTIONAL, DEFAULT="t*pos" (t-score by position)

    format= Output format of the plot.  OPTIONAL, DEFAULT="pslepsfc"
          (full color encapsulated postscript) see the SAS Software
          manual for more options

    axis1= The title for axis1, the vertical axis
          OPTIONAL, DEFAULT=t Score

    axis2= The title for axis2, the horizontal axis
          OPTIONAL, DEFAULT=Position on chromosome (cM)

    tail= Set which tail of the distribution is of interest for the
          one-tailed DF basic test.  The p-values for the
          uninteresting tail are all set to 1.  The high tail is
          interesting if the probands were selected from the high end
          of the distribution, and vice-versa for "low".  Any
          transformations you have performed on the data may effect
          which tail is interesting. 
          OPTIONAL, DEFAULT=do nothing, use "high" or "low"
          to set the tail of interest.  Any value other than "high" or
          "low" leaves the p-values unchanged.

The plot files are named "<model>.eps" with the name of the model in
    the filename.  The graphs are titled "Plot by <model> method".  At
    this time the only way to change the filenames or graph titles is
    to edit the macro at the lines "filename t..." and "title1
    ..."  In both of these places, the term "&model" is replaced by
    the name of the model, so it is strongly suggested that &model be
    preserved in the filename so that the plots from a single run
    do not overwrite each other.
    
********************************************************************/    
%macro qms2(data=ibdphen,
    output=stats,
    infce=0,
    infout=infce,
    plot=0,
    vars=t*pos,
    format=pslepsfc,
    axis1=t-score,
    axis2=Position on chromosome (cM),
    tail=nothing,
    spr=compute);

    
/* create the dataset to contain the results if it does not already
    exist */
%if %sysfunc(exist(&output))=0 %then %do;
    data &output;
    run;
%end; /* existence check */
    
/* create the dataset to contain the influence statistics if it does
    not already exist */
%if %sysfunc(exist(&infout))=0 %then %do;
    data &infout;
    run;
%end; /* existence check */
    
/* This macro handles the data produced by the regressions */
/* It is inefficient to define one macro within another, but it works */
%macro output(model);

proc univariate normal data=resid noprint;
    output out=compr normal=shapiro probn=shapirop;
    var compr;
    by pos;

/* get the proper degrees of freedom */
proc means data=&data noprint;
    var df;
    by pos;
    output out=df mean=m n=n;
data modeldf;
    set results (keep=_in_);
    if _in_ ne .;
    dummy=0;
run;
data df;
    set df;
    /* real_n is the number of unique statistics */
    real_n=m*n;
    /* Hasemant-Elston is never double entered */
    %if "&model"="Haseman-Elston" | "&model"="Sham-Purcell" %then n=real_n;;
    dummy=0;
    drop _type_ _freq_ m;
run;
/* the model degrees of freedom is the number of independent variables */
data df;
    merge df modeldf (rename=(_IN_=modeldf));
    by dummy;
    drop dummy;
run;

/* compute fit statistics based on the corrected degrees of freedom */
data fits;
    merge df results compr;
    by pos;
    if _type_="T" then do;
    %if "&model"="DF_Augmented" %then %do;
        t=augment*sqrt(real_n/n);
        %end;
    %else %do;
        t=pihat*sqrt(real_n/n);
        %end;
    p=(1-probf(t**2,1,real_n-modeldf-1))/2;
    t2=t**2;
    /* set p to 1 for the wrong tail */
    %if "&model"="DeFries-Fulker" %then %do;
        %if "&tail"="high" %then %do;
            if t lt 0 then p=1;
            %end;
        %else %if "&tail"="low" %then %do;
            if t gt 0 then p=1;
            %end;
        %end;
    %else %if "&model"="Haseman-Elston" %then %do;
        if t gt 0 then p=1;
        %end;
    %else %do; /* New-He DF_Augmented and Sham-Purcell */
        if t lt 0 then p=1;
        %end;
    end;
    if _type_="T";
    drop _model_ _depvar_ _rmse_ pihat comp _in_ _p_ _edf_ _rsq_ phen2 depvar
        %if "&model"="DF_Augmented" %then augment;
run;

/* get a datafile with the fit statistics and betas */
data parms;
    set results (keep=_type_ pos _rmse_ pihat
        %if "&model"="DF_Augmented" %then augment;
    );
    if _type_="PARMS";
run;
data tmprslt;
    merge fits (drop=_type_)
        parms (rename=( _rmse_=rmse pihat=b_pihat
        %if "&model"="DF_Augmented" %then augment=b_aug;
    ) drop=_type_ );
    model="&model";
run;

/* merge the results with the output dataset */
data &output;
    set &output tmprslt;
    /* drop the blank line created in the empty dataset */
    if N ne .;
    run;

/* handle the influence statistics, if requested */
%if &infce=1 %then %do;
data tmpinf;
    set tmpinf;
    model="&model";
run;
data &infout;
    set &infout tmpinf;
    /* drop the blank line created in the empty dataset */
    if pos ne .;
run;
%end; /* influence statistics */
    
%if &plot=1 %then %do;
/* graph the results */

/* filename to save graph under */
filename graph "&model..eps";

/* Graphing options */
goptions
    /* reset all options to default */
    reset=all
    /* select the font (helvetica) */
    ftext=hwpsl009
    /* full color encapsilated postscript output */
    device=&format
    /* replace the output file if it already exists */
    gsfmode=replace
    /* make the graph print in landscape(!) */
    rotate=portrait
    /* select the size of the graph.  It is eps so it will scale */
    vsize=3.5in hsize=6.0in
     /* fileref to use for graph */
    gsfname=graph;

/* set the graph main title */
title1 "Plot by &model method";

/* add axis labels */
axis1 label=(
    /* make the text vertical */
    angle=90
    /* center justify */
    justify=c
    "&axis1");
/* define another axis */
axis2 label=(justify=c "&axis2");

proc gplot data=tmprslt;
    symbol
    /* method of connecting points, use join for straight lines */
    interpol=join
    /* make the line a bit thicker */
    width=3;        
    plot &vars /
        /* add vertical and horizontal reference lines */
        grid
        /* use the axis statements set earlier */
        vaxis=axis1 haxis=axis2;
%end; /* end plotting */

%mend output;

/* Haseman-Elston Regression */
data he;
    set &data;
    /* drop double entered records because twin ordering is irrelevant
        for HE */
    if df=1;
    /* create the dependent variable, squared sib difference */
    comp=(phen1 - phen2)**2;
run;

proc reg data=he tableout edf outest=results noprint;
    model comp = pihat %if &infce=1 %then / influence ;;
    output out=resid r=compr;
    %if &infce=1 %then
    output out=tmpinf dffits=dffits h=h rstudent=rstudent;;
    by pos;

%output(Haseman-Elston);
    
/* New Haseman-Elston Regression */
/* Get the means of the phenotypes */
proc means data=&data noprint;
    var phen1 phen2;
    output out=means mean=mean1 mean2;
data means;
    set means;
    dummy=1;
run;
data nhe;
    set &data;
    dummy=1;
run;

data nhe;
    merge nhe means(drop=_type_ _freq_);
    by dummy;
    /* create the dependent variable */
    comp=(phen1 - mean1)*(phen2 - mean2);
    drop=dummy;
run;

proc reg data=nhe tableout edf outest=results noprint;
    model comp = pihat %if &infce=1 %then / influence ;;
    output out=resid r=compr;
    %if &infce=1 %then
    output out=tmpinf dffits=dffits h=h rstudent=rstudent;;
    by pos;

%output(New-HE);
    
/* DeFries-Fulker Basic Regression */
proc reg data=&data tableout edf outest=results noprint;
    model phen2 = phen1 pihat %if &infce=1 %then / influence ;;
    output out=resid r=compr;
    %if &infce=1 %then
    output out=tmpinf dffits=dffits h=h rstudent=rstudent;;
    by pos;
%output(DeFries-Fulker);

/* DeFries-Fulker Augmented Regression */
data dfaug;
    set &data;
    augment=phen1*pihat;
    run;

proc reg data=dfaug tableout edf outest=results noprint;
    model phen2 = phen1 pihat augment %if &infce=1 %then / influence ;;
    output out=resid r=compr;
    %if &infce=1 %then
    output out=tmpinf dffits=dffits h=h rstudent=rstudent;;
    by pos;        
%output(DF_Augmented);

/* Sham-Purcell Regression Approach */
/* This requires a bi-variate normal with mean 0 and SD 1 in the original population */
data sham;
    set &data;
    /* Is this method valid for double entered data? */
    if df=1;
    dummy=1;

/* compute the sib-pair correlation if it isn't provided */
%if "&spr"="compute" %then %do;
/* create the correlation used in the dependent variable */
    proc corr data=sham nosimple noprint outp=shamr;
        var phen1 phen2;

/* get r into a dataset with a dummy variable for merging */    
    data shamr;
        set shamr;
        if _type_="CORR" and _name_="phen1";
        r=phen2;
        dummy=1;
        keep r dummy;
%end;
%else %do;
    data shamr;
        r=&spr;
        dummy=1;
%end;

/* setup the dependent variable */
data sham;
    merge sham shamr;
    by dummy;
    depvar=((phen1+phen2)**2/(1+r)**2) - ((phen1-phen2)**2/(1-r)**2)
        + ( (4*r)/(1-r**2));
    pihat=pihat-.5;
    drop dummy r;

proc reg data=sham tableout edf outest=results noprint;
    model depvar = pihat / noint %if &infce=1 %then influence ;;
    output out=resid r=compr;
    %if &infce=1 %then
    output out=tmpinf dffits=dffits h=h rstudent=rstudent;;
    by pos;
%output(Sham-Purcell);

/* Reset the title to be blank */
title 'QMS2 Output';
footnote1 "Lessem, J. M., and Cherny, S. S. (2001) DeFries-Fulker";
footnote2 "multiple regression analysis of sibship data: a SAS macro.";
footnote3 "Bioinformatics, 17, 371-372.";

%mend qms2;