# helper functions for the collinearity analysis, written by the authors of the paper
# specific author is given in each function

# Functions for diagnostics of collinearity are collected in "diagnostics.r".
# Functions for Boosted Regression Tree and Multiple Adaptive Regression Splines (MARS) are available on request from Jane Elith (we include a recent version but will not keep it updated): brtfunctions.r and marsHelpers.r
# OSCAR is run in MATLAB. We provide functions to run OSCAR in MATLAB from R (see OSCAR_functions.r).

# alphabetically listed!

# INDEX:

# AICc.r2: calculates an AICc from the R2 of a linear model
# BIC.r2: as AICc.r2, just the BIC
# coef.cpca
# coef.pls
# CollClustReduction
# collineariser
# CPCA.fit
# createLists
# detect
# dr.direct.default.br
# formula.maker: uses colnames of a dataframe to construct a formula; with options for quadratic effects and (first-order) interactions
# functioncutter: transforms the function given in dats into an interpretable expression
# getVIF
# giveWeightsVif
# ivif
# makeFormula
# newPCclusters
# PCAclusteringbackward
# PCAclusteringforward
# pplsoptim
# pred.cpca
# pred.lrr
# pred.pcr
# pred.pls
# pred.ppls
# predict.lasso (also for ridge)
# predict.seqreg
# seqreg
# select07
# step.seqreg  #
# train.cpca
# train.lrr
# train.pcr
# train.pls
# train.ppls
# varcluster


AICc.r2 <- function(R2, n, p){
    # function to determine AICc for normally distributed data from their R2, sample size n and number of model parameters p
    # by Carsten F. Dormann
    (1-R2)*(n-1) + p*2* (p+1)/(n-p-1) # according to Burnham & Anderson
}


BIC.r2 <- function(R2, n, p){
    # function to determine BIC for normally distributed data from their R2, sample size n and number of model parameters p
    # by Carsten F. Dormann
    (1-R2)*(n-1) + p*log(n)
}


coef.cpca  <-  function(object){
  # extract coefficients from CPCA-regression object
  # author: Gudrun Carl
  coefcpca <- object$coef
  coefcpca
}


coef.pls  <-  function(object){
  # extract coefficients from pls-regression object
  # author: Gudrun Carl
  family <- object$family
  comps <- object$comps
  if(family=="gaussian") coefpls <- object$pls$coef[, comps]
  if(family=="binomial") coefpls <- object$pls$coef
  coefpls
}


CollClustReduction <- function(X, y, clusters=list(c()), varselect='centred'){
    # written by Bruno Laforcade and Gabriel Carr, with later modifications by CFD (june 2008)
    require(Hmisc)
    if (!varselect %in% c("centred", "summarised", "explained")) stop("Please select a valid method for selecting variables within clusters: 'centred', 'explained' or 'summarised'.")
	  N <- ncol(X)
	  newvar <- as.data.frame(matrix(nrow=nrow(X), ncol=length(clusters)) )
	  if (varselect =='centred'){ 
    # extract the "central" variable with least distance to all others in the cluster
	 	 for (i in 1:length(clusters)){
	  			if (length(clusters[[i]])==1){
              addvar <- clusters[[i]]
          } else {
  	  			table <- as.matrix(dist(t(X[,unlist(clusters[[i]])])))
  	  			addvar <- names(which.min(rowSums(table)))
          }
  			  newvar[, i] <- X[, addvar] #drop=F retains column names
  			  colnames(newvar)[i] <- addvar
      }
      attr(newvar, "type") <- "centred"
	  }
	  if (varselect=='explained'){  
    # extract the most representative variable in terms of best fit to response variable!
		  for(i in 1:length(clusters)){
		     aic <- sapply(clusters[[i]], function(x) gam(y~s(X[, x], bs="cs"), family= 'gaussian')$aic)
        addvar <- names(which.min(aic))
 	  		newvar[,i] <- X[,addvar]
 			  colnames(newvar)[i] <- addvar
		  }
      attr(newvar, "type") <- "explained"
 	  }
	  
    if (varselect=='summarised'){
    # extract the entire cluster as the first PC
		  for (i in 1:length(clusters)){
		  		#clusters[[i]] <- sort(clusters[[i]])
		  		pca <- prcomp(X[, clusters[[i]]], scale=T)
		  		newvar[,i] <- pca$x[,1]
		  		if (length(clusters[[i]])==1){
            colnames(newvar)[i] <- clusters[[i]]
          } else { 
            colnames(newvar)[i] <- paste("PCcluster", i, sep="")
          }
#		  		X <- cbind(X, g)			#(paste('PC',clusters[[i]][1],sep='.'))
#		  		newvar <- c(newvar, ncol(X))
		  }
      attr(newvar, "type") <- "summarised"
	  }
	  return(cbind.data.frame(y, newvar)) 
}
#CollClustReduction(X=cdata[,-1], y=cdata[,1], clusters=clusters)


# make collinear data from the CreCre-data:
#load("CreCre.Rdata")
#library(mvtnorm)
#names(CreCre)
#co <- cor(CreCre[,-c(1:4, 32, 33, 34)])
#playdata <- list(train=1, test=2, uncortest=3)
#set.seed(101)
#ran <- rmvnorm(1000, mean=colMeans(CreCre[-c(1:4, 32, 33, 34)]), sigma=co)
#colnames(ran) <- paste("X", 1:27, sep="")
#which(co>0.7, arr.ind=T)
#cor(ran[,14:17])
#y = -496 + 0.1*ran[,"X22"] + 0.02*ran[,"X23"]+0.0002*ran[,"X23"]^2+0.001*ran[,"X24"]*ran[,"X25"]
#playdata$train <- cbind(y=y[1:500], ran[1:500,])
#playdata$test <- cbind(y=y[501:1000], ran[501:1000,])
#set.seed(102)
#uncor <- rmvnorm(500, mean=colMeans(CreCre[-c(1:4, 32, 33, 34)]), sigma=diag(1, 27, 27))
#colnames(uncor) <- paste("X", 1:27, sep="")
#y = -496 + 0.1*ran[,"X22"] + 0.02*ran[,"X23"]+0.0002*ran[,"X23"]^2+0.001*ran[,"X24"]*ran[,"X25"]
#y2= -496 + 0.1*uncor[,"X22"] + 0.02*uncor[,"X23"]+0.0002*uncor[,"X23"]^2+0.001*uncor[,"X24"]*uncor[,"X25"]
#playdata$uncortest <- cbind(y=y2, uncor)
#save(playdata, file="playdata.Rdata")
#load("playdata.Rdata")
#hist(abs(cor(playdata$train[,-1])[upper.tri(cor(playdata$train[,-1]))]))


collineariser <- function(k=5, N=1000, decay=0.02, plotit="no", method="normal",
  seed=round(decay*1000), y.noise=0.5, scale=FALSE,
  FUN=function(X) 25+2*X[,1] -1.5*X[,1]^2 -2*X[,21] +0.001*X[,1]*X[,21], ...){
        # Function to generate collinear data with the following properties:
        # * 4 clusters with k variables each
        # * 1 uncorrelated variable
        # * y as a function of Xs
        #
        # k       number of variables for each of the four clusters
        # N       number of data points
        # decay   how fast shall the strength of collinearity decay in the most
        #         highly correlated cluster? good value range is 0.01 to 0.25:
        #     0.01 is extremely high, with no correlation below 0.64!
        #     0.02 min r = 0.5 (in least collinear cluster), max r = 0.99 (in most collinear cluster)
        #     0.1 is good collinearity: max around 0.7 and min (low cluster) of absent
        #     0.25 is pratically very low correlation!
        # plotit  returns an image of the correlation matrix ("image") or a
        #         histogram ("hist") or nothing ("no")
        # method  "normal" produces normally distributed y-values, any other binary
        # seed    set random seed; or NULL
        # scale   shall the data be scaled to mean=0 and sd=1? Defaults to FALSE.
        
        # returns a list with 6 data sets:
        # 1. train, 2. test.same, 3. test.more, 4. test.less, 5. test.nonlin and 6.
        # test.none with appropriately adapted collinearity structure.
        # The first variable, y, is the response and it is NOT scaled!
        # Variables X1:X5, X6:X10, ... form a cluster each; X21 is uncorrelated
        # with all variables.
        # The functional relationship between y and X is depicted in the attributes
        # of the list.


    require(mvtnorm)
    # helper functions: --------------------------------------------------------
    Xmaker <- function(k=k, N=N, decay=decay, ...){
        histcor <- function(X) hist(abs(cor(X)[upper.tri(cor(X))]))
        X1 <- runif(N)
        M <- matrix(rep(X1, k), ncol=k, byrow=F) # all X are identical
        colnames(M) <- paste("X", 1:k, sep="")
        # add noise:
        Mn <- M
        for (i in 1:k) Mn[,i] <- M[,i] + rnorm(N, 0, i*decay)
        if (plotit=="hist") histcor(Mn)
        if (plotit=="image") image(cor(Mn)[, ncol(cor(Mn)):1], col=rev(heat.colors(100))) #nrow(cor(Mn)):1
        #if( min(eigen(cor(Mn))$values) > 0) cat("Gut!!") #"\n"
        Mn
    }
    
    Data <- list(train=1, test.same=2, test.more=3, test.less=4, test.nonlin=5, test.none=6)
    
    # generate train/test.same data:
    if (!is.null(seed)) set.seed(seed)
    X <- matrix(ncol=21, nrow=2*N)
    decay.vec <- c(decay, decay*2, decay*3, decay*5)  # determines decays in clusters
    for (i in 1:4) {
      X[,(i*5-4):(i*5-4+4)] <- Xmaker(k=5, N=2*N, decay=decay.vec[i])
    }
    X[,21] <- runif(N)  # should be uncorrelated with all variables
    colnames(X) <- paste("X", 1:21, sep="")
    Data$train <- X[1:N,]
    Data$test.same <- X[(N+1):(2*N),]

    # generate test.more data:
    X3 <- matrix(ncol=21, nrow=N)
    decay.vec <- c(decay/2, decay*2/2, decay*3/2, decay*5/2)  # determines decays in clusters
    for (i in 1:4) {
      X3[,(i*5-4):(i*5-4+4)] <- Xmaker(k=5, N=N, decay=decay.vec[i])
    }
    X3[,21] <- runif(N)
    colnames(X3) <- paste("X", 1:21, sep="")
    Data$test.more <- X3
    
    # generate test.less data:
    X4 <- matrix(ncol=21, nrow=N)
    decay.vec <- c(decay*2, decay*2*2, decay*3*2, decay*5*2)  # determines decays in clusters
    for (i in 1:4) {
      X4[,(i*5-4):(i*5-4+4)] <- Xmaker(k=5, N=N, decay=decay.vec[i])
    }
    X4[,21] <- runif(N)
    colnames(X4) <- paste("X", 1:21, sep="")
    Data$test.less <- X4

    # generate test.nonlin data:
    X5 <- X[1:N,]
    for (i in 1:4) {
      for (j in 1:4){
          x <- X5[,(i*5-4+j)]
          X5[,(i*5-4+j)] <- x * (1-exp(-0.02*x))
      }
    }
    colnames(X5) <- paste("X", 1:21, sep="")
    Data$test.nonlin <- X5

    # generate test.none data:
    X6 <- rmvnorm(N, mean=rep(0, 21), sigma=diag(1, 21, 21))
    colnames(X6) <- paste("X", 1:21, sep="")
    Data$test.none <- X6

    # scale all X-data sets:
    if (scale) {Data.scaled <- lapply(Data, scale)} else {Data.scaled=Data}

    # make y for each Data set:
    
    
    # X <- Xmaker(N=2*N, ...)
#curve(2*x-1.5x*x, from=-2, to=2)
    f <- FUN
    if (method=="normal"){
      Ys <- lapply(Data.scaled, function(X) rnorm(n=nrow(X), mean=f(X), sd=y.noise) )
    } else {
      Ys <- lapply(Data.scaled,
        function(X) rbinom(nrow(X), size=1, plogis(rnorm(n=nrow(X), mean=f(X)-25, s=y.noise)) ))
    }

    Data.final <- Data.scaled
    attr(Data.final, "function") <- attr(f, "source")
    for (i in 1:length(Data))
        Data.final[[i]] <- cbind(y=Ys[[i]], Data.scaled[[i]])

    Data.final
}
#dats1 <- collineariser()#$uncortest[1:3,]
#image(cor(dats1$train))
#str(collineariser(decay=.1, N=10, y.noise=.2, method="binary", plotit="image"))


CPCA.fit <- function(alpha=0.1, method="AIC", thedata, ...){
      # CPCA requires two parameters: alpha (the contraint) and the number of components retained
      # searching for the best parameter combination requires considering the AICc of each option
      # first, we give an alpha value
      # then, we search for the best AICc/BIC number of components
      
      fm.cpca <- train.cpca(alpha=alpha, ...)
      fit <- NS <- 1:ncol(fm.cpca$coef)
      for (i in NS){
         pred.train.cpca <- as.matrix(thedata[,-1]) %*% fm.cpca$coef[,i]  
         fit[i] <- cor(pred.train.cpca, thedata[,1])^2 
      }
      n.comp.aicc <- which.min(AICc.r2(fit, 100, NS))
      n.comp.bic <- which.min(BIC.r2(fit, 100, NS))

      fm.cpca.aicc <- train.cpca(alpha=alpha, comps=n.comp.aicc, ...)
      fm.cpca.bic <- train.cpca(alpha=alpha, comps=n.comp.bic, ...)
      if (method=="AIC") return(cor(pred.cpca(fm.cpca.aicc, newdata=thedata), thedata[,1]) )
      if (method=="BIC") return(cor(pred.cpca(fm.cpca.bic, newdata=thedata), thedata[,1]) )
}
#CPCA.fit(0.4, data=train)


createLists <- function(df.x, proxyVector){ 
  # helper function for ivif
  # returns list with variable names and another with ids
  # author: Sven Lautenbach
	names <- vector("list", 0 )
	ids <- vector("list", 0 )
	for (i in (min(proxyVector):max(proxyVector)) )
	{
		pos <- length(ids) +1
    ids[[pos]] <- which(proxyVector==i)
		names[[pos]] <- names(df.x)[ids[[pos]] ]
	}
	res <- list()
	res$names <- names
	res$ids <- ids
	return(res)
}


detect <- function(var.set, pattern="*"){
      # function to detect a symbol or pattern in a set of variable names
      # returns the expression with the pattern
      # author: Carsten F. Dormann, October 2006
      index <- grep(pattern, var.set, fixed=T)
      var.set[index]
}


dr.direction.default.br <- function(object, which = 1:object$numdir, x = dr.x(object)) {
    # Function to sort out the signs in the dr methods
    # Does not work for ire method
    # BR, June 2008
    orig.ans <- (apply(dr.x(object), 2, function(x) {
        x - mean(x)
    }) %*% object$evectors)
    orig.sign <- orig.ans[1,] <= 0
    ans <- x
    for (i in 1:ncol(ans)){
       ans[,i] <- ans[,i] - mean(x[,i])
    }
    ans <- ans %*% object$evectors
    for (i in 1:ncol(ans)) {
       if (orig.sign[i]) ans[,i] <- -ans[,i]
    }
    ans <- ans[, which, drop=FALSE] # drop = FALSE necessary to preserve matrix structure
                                    # even if which equals 1
        dimnames(ans) <- list(attr(x, "dimnames")[[1]], paste("Dir",
            which, sep = ""))
    ans
}


formula.maker <- function(dataframe, y.col=1, quadratic=TRUE, interactions=TRUE){
  # makes a formula for GLM from dataframe column names, 
  # including quadratic effects and first-order interactions
  # by default, first column is taken to be the response (y); else, an integer giving the column with the response in "dataframe"
  # by Carsten F. Dormann
  if (quadratic && interactions) {
      f <- as.formula(paste(colnames(dataframe)[y.col], " ~ (", paste(colnames(dataframe[,-y.col]), collapse=" + ", sep=""), ")^2 + ", paste("I(", colnames(dataframe[,-y.col]), "^2)", collapse="+", sep="")))
  }
  
  if (quadratic & !interactions){
      f <- as.formula(paste(colnames(dataframe)[y.col], " ~ (", paste(colnames(dataframe[,-y.col]), collapse=" + ", sep=""), ") + ", paste("I(", colnames(dataframe[,-y.col]), "^2)", collapse="+", sep="")))
  }

  if (!quadratic & !interactions){
      f <- as.formula(paste(colnames(dataframe)[y.col], " ~ ", paste(colnames(dataframe[,-y.col]), collapse=" + ", sep="") ))
  }

  if (!quadratic & interactions){
      f <- as.formula(paste(colnames(dataframe)[y.col], " ~ (", paste(colnames(dataframe[,-y.col]), collapse=" + ", sep=""), ")^2"))
      # + ", paste("I(", colnames(dataframe[,-1]), "^2)", collapse="+", sep="")))
  }
  
  f
}


functioncutter <- function(dats){
    # transforms the function given in dats into an interpretable expression
    # by Carsten F. Dormann  
    funname <- strsplit(attr(dats, "function"), "function(X) 25+", fixed=TRUE)[[1]][2]
    funname <- gsub(x=funname, pattern="5\\*", replacement="")
    funname <- strsplit(funname, " # ", fixed=TRUE)[[1]][1]
    funname <- gsub("\\[", "", funname)
    funname <- gsub("\\]", "", funname)
    funname <- gsub(",", "", funname)
    funname <- gsub(" ", "", funname)
    for (i in 1:21){
      # replace squared variables by the appropriate term in the function call:
      funname <- sub(paste("X", i, "^2", sep=""), paste("I(X", i, "^2)", sep=""), funname, fixed=TRUE)
    }
#    funname <- sub("X4^2", "I(X4^2)", funname, fixed=TRUE) 
    funname <- gsub("-", "+", funname)   

    funname
}


getVIF <- function(theFormula, theData, mod, VIF=TRUE ){
  # helper function for ivif
  # author: Sven Lautenbach
	#print(theData)
	if (mod == "lm")
	{
	 #print(theFormula)
		theModel <- lm(formula = theFormula, data= theData)
		#print( length(theModel$model))

	}
	#paste("Length model: " , length(theModel$model))
	if ( length(theModel$model) <= 2)    # this has been changed to account
	{                                    # for situations then the first and the second variable are collinear
	  theVIF <- numeric(1)
	}
	else
	{
    theVIF <- vif(theModel)
 }
	if (is.null(colnames(theVIF)))
	{
		return (theVIF)
	}
	else
	{
		return(theVIF[,"GVIF"])
	}
}


giveWeightsVif <- function(X, cut=0.1, expon=log(nrow(X))) {
    # require(perturb)
    # by Tamara Mnkemller & Bjrn Reineking
    vifweight <- suppressWarnings(sapply(1:nrow(X), function(x) {max(colldiag(X[-x,])$condindx) ^expon}))
    weight <- vifweight/mean(vifweight)              
    #weight <- (vifweight-mean(vifweight))/sd(vifweight) #ranges weight between 0 and 1 #CFD
    #ifelse(weight < cut, 0, weight) #CFD
    return(weight)
}


ivif <- function (x,y, mod="lm", threshold=1.5, jumpThreshold=5, trace=FALSE) {
  ##################################################
  #
  # iterative variance inflation factor method
  #
  # References: Gordon D. Booth, Michael J. Niccolucci, Ervin G. Schuster (1994)
  # "Identifying proxy sets in multiple linear regression: an aid
  # to better coefficient interpretation"
  # Res. Pap. INT-470.  Ogden, UT:
  # U.S. Department of Agriculture, Forest Service, Intermountain Research Station; 1994. 12 p.
  #
  #
  # author: sven.lautenbach@ufz.de
  # date: 15.10.2007
  # last modified: 24.10.2008
  # bug fixed (use of j instead of currentModelIDs[j]) on 06.11.2007
  # bug fixed (forgot to reset newModel after first loop) on 12.12.2007
  # bug fixed (reintroduction of all variables from proxysets in the second step)
  #   on 23.10.2008
  ##################################################
  
  ###
  # Variance inflation factors are a scaled version of the multiple correlation coefficient between
  # variable j and the rest of the independent variables. Specifically,
  #      VIF(j) = 1/(1 - R(j)**2)
  # where Rj is the multiple correlation coefficient.
  # Variance inflation factors are often given as the reciprocal of the above formula.
  # In this case, they are referred to as the tolerances.
  # If Rj equals zero (i.e., no correlation between Xj and the remaining independent variables),
  # then VIFj equals 1. This is the minimum value. 
  ###
  
  ####
  # proxy set information is stored in a vector
  # the proxy sets vector relates to the xvar id by its position
  # and to the proxy set by its value
  # for example
  # xvar  		1 2 3 4 5
  # proxyset  1 1 2 3 2
  # for the construction of the model formula, the xvar id is used
  

  require(car)
	
  if (!(inherits(y, "data.frame"))) y <- as.data.frame(y)
  if (!(inherits(x, "data.frame"))) x <- as.data.frame(x)
  
	# check arguments
	if ( mod != "lm" && mod != "glm")
		{ return("The fit object is not of type lm or its inherited classes! Aborting")	}
	if (!(inherits(x, "data.frame")))
		{ return("The x object is not of type data.frame or its inherited classes! Aborting")	}
  if (!(inherits(y, "data.frame")))
		{ return("The y object is not of type data.frame or its inherited classes! Aborting")	}
	if (is.null( names(y) ) ) {	return("Please specify names for y") }
	if ( is.null(names(x) ) )	{	return("Please specify names for x")	}
	if (length(x[,1]) != length(y[,1]) ) { return("Data frames differ in size") }
	if (threshold <= 1.1) { return("Thresholds <= 1.1 should not be used") }

	theDF <- as.data.frame(cbind(y,x))
	currentModelIDs <- 1
	#currentModelIDs[2] <- 2
	###
	# the proxy sets is a vector indicating to which proxy set a x-variable belongs
	theProxySets <- rep(-1, length(x))

	# the ids of the x-vars which have been removed
	# is lateron needed by reinventing the xars in the model
	removedIDs <- vector("list",0)

	#create the vif for the first two variables in x
	prevFormula <- makeFormula(x,y,currentModelIDs)
	# the first variable belongs to the first proxy set
  theProxySets[1] <- 1


	## loop over all explanatory variables in y
	for (i in 2:length(x) )
	{
		newModelIDs <- currentModelIDs
    newModelIDs[length(newModelIDs)+1] <- i


		newFormula <- makeFormula(x,y,newModelIDs)
		newVIF <- getVIF(newFormula, theData = theDF, mod)

		if (trace) {print(newVIF)}

		# vergleich  newVIF mit threshold
		#ggf. modelIDs updaten, sonst in proxy set einordnen
		
		aboveTH <- FALSE
		theFirst <- -1 # indicates if it is the first VIF greater then the treshhold
											# or not. If not lists are merged and the first is the position
											# in the list of proxy sets there we other elemens should be connected to
		IDs2rm <- vector("list",0)
		count <- 1
		
		for (j in 1:length(currentModelIDs)) 	# loop over the variables in the current model and
		{   #  check if their VIF values have increased more then the threshold
			if (newVIF[j] > threshold)
			{
				aboveTH <- TRUE
				if (trace) { print(paste("rejected variable: ", names(x)[i],
								". VIF > threshold for ", names(x)[currentModelIDs[j]],sep=" ")) }
				if(theFirst == -1)
				{
					# set the proxy-id for the new var to the proxy id of the
					# var which VIF has jumped
        	theProxySets[i] <- theProxySets[currentModelIDs[j]]
					#store the proxy set id in case more then one variable has jumped
					theFirst <- theProxySets[ currentModelIDs[j] ]

					# store id in the list
					# if the id is not already in the list
					if (sum(removedIDs == i) == 0)
					{
          	removedIDs[[length(removedIDs) +1]] <- i
					}
				}
				else # there has been an vif > threshold before for the variable x_i
				{
          theProxySets[j] <- theFirst # was i before, 23.10.08 changed
          # store id in the list
					# if not already stored
          if (sum(removedIDs == currentModelIDs[j]) == 0) # changed from '== i' to 'currentModelIDs[j]'
					{
          	removedIDs[[length(removedIDs) +1]] <- currentModelIDs[j] # changed from '<- i' to '<- currentModelIDs[j]'
						if(trace)
						{	print(paste("marking variable", names(x)[currentModelIDs[j]], " for deletion from current model"))}
						#print("removedIDs:")
						#print(unlist(removedIDs))
					}
          
					#add the index to the list to be removed from the current model ids
					IDs2rm[[count]] <- currentModelIDs[j] # bug removed, was j before
					count = count +1
				}
        if (trace)
					{
						print("**** after update of proxy list ***")
	          print (paste(theProxySets,collapse=", "))
					}
			}
		}
		if (!aboveTH)
		{ 
			currentModelIDs[length(currentModelIDs)+1] <- newModelIDs[length(currentModelIDs)+1]
			if (trace) { print(paste("Model IDs updated with variable ", names(x)[i], sep=" ")) }
			#create new proxy set
      theProxySets[i] <- max(theProxySets) +1
   	}
		else if (length(IDs2rm) > 0)
		{ # delete the x variables indices from the model
			# if lists have been combined
			#print(currentModelIDs)

			for (k in 1:length(IDs2rm) )
			{
				if ( length( which( currentModelIDs == IDs2rm[[k]] ) ) > 0)
					{ currentModelIDs <- currentModelIDs[ - which(currentModelIDs == IDs2rm[[k]])]
					}
				else
					print("Should never happen.... check source code...")
				print(which(currentModelIDs == IDs2rm[[k]]))
			}
      #IDs2rm <- vector("list",0)
		}

  	prevFormula <- makeFormula(x,y,currentModelIDs)
		prevVIF <- getVIF(prevFormula, theData = theDF, mod)

	}
   newModelIDs <- currentModelIDs # new checking
	####
	# loop over removed ids list
	# and reinvent the removed x-vars
	# to see if there are jumps in the VIFs
	# step 3B and 3C in Booth et al. 1994
	###
	for (rID in removedIDs)
	{
    
		pos <- length(newModelIDs)+1
    
		if ((rID > -1) & (sum(newModelIDs == rID) == 0)) # changed 23.10.2008 , added & (sum(newModelIDs == rID) == 0)
		{ 
		  newModelIDs[pos] <- rID # changed 23.10.08 was outside block before
	    newFormula <- makeFormula(x,y,newModelIDs)
			#print(newFormula)
			newVIF <- getVIF(newFormula, theData = theDF, mod)
			#print("newVIF:")
      #print(newVIF)
			#print(prevVIF)
			relVIF <- newVIF[-pos] / prevVIF
			if (trace)
			{ print(paste("Reinventing ", names(x)[rID]))
				print ("VIFs which excessed jump-threshold:")
				print (which(relVIF > jumpThreshold))
				print(newVIF)
			}
			# set the proxy set id for the xvars which jumped
			# to the id of the proxy set there the newly reinmvented variable belongs to
			# 	which(relVIF > jumpThreshold) -> the current-model-vars which jumped
			#   newModelIDs[which(relVIF > jumpThreshold) ] -> the ids of the original xvars
			#   theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] -> the proxyset ids of the selected xvars
			# 	theProxySets %in% theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] -> all xvars which belong to the same proxy set
			theProxySets <- ifelse( theProxySets %in% theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] ,
						theProxySets[rID] , theProxySets)

			prevVIF <- newVIF
		}
    
	}

	result <- vector("list",0)
	result$XIDs <- theProxySets
	
	res <- createLists(x, theProxySets)
	# changed 23.10.2008
	# store only the names and ids for non empty lists
	# I am sure there is a nicer way out there to do it...
	result <- vector("list", 0)
	result$ids.grouped <-vector("list", 0)
	result$names.grouped <-vector("list", 0)
	count <- 1
	for(ii in 1:length(res$ids))
	{
	  if (length(res$ids[[ii]]) != 0)
	  {
	    
	    result$ids.grouped[[count]] <- res$ids[[ii]]
	    result$names.grouped[[count]] <- res$names[[ii]]
	    count <- count +1
	  }
	}
	return(result)

}


makeFormula <- function( x, y, idVector, combine= " + "){
  #create a formula out of x and y vector
  # helper function for ivif
  # for x take only the columns that are specified in the ID vector
  # author: Sven Lautenbach

	theStringY = paste( names(y), collapse = combine)
 	theStringX = paste( names(x[idVector]), collapse = combine)
	theString = paste (theStringY, theStringX, sep = " ~ ")
	return (as.formula(theString))
}


newPCclusters <- function(clusters, train, test){
        # function to transform test data in the same way as training
        # written by CFD, 25 Aug 2008
        newtest <- matrix(ncol=length(clusters), nrow=nrow(test))
        colnames(newtest) <- LETTERS[1:length(clusters)]
        for (i in 1:length(clusters)){
		  	 	if (length(clusters[[i]])==1){
		         addvar <- test[,clusters[[i]], drop=TRUE]         
		  		} else {
  		  		  pca <- prcomp(train[,clusters[[i]]], scale=T)
              addvar <- predict(pca, newdata=test)[,1]
          }    
          newtest[,i] <- addvar         
	 	  		if (length(clusters[[i]])==1){
             colnames(newtest)[i] <- clusters[[i]]
            } else { 
              colnames(newtest)[i] <- paste("PCcluster", i, sep="")
            }
  		  }
  		  as.data.frame(newtest)
}


PCAclusteringbackward <- function(cdata, threshold=0.32){
    # function to get cluster grouping through PCA
    # threshold specifies the correlation at which variables loading on a PC should be regarded as belonging to the same cluster
    # goes from last to first PC (suggested during the workshop: reference???)
    # returns a list of variable names; each entry is a cluster
    #
    # method as communicated by PO and PL, implemented by CFD, Juni 2008
    # NOT ADVISABLE! use PCAclusteringforward unless you have a good reason not to!
    PCA <- prcomp(cdata, scale=T)
    clusters <- list() # the list of clusters
    clusterID <- 1 # the counter for the clusters
    newPCA <- PCA$rotation # to store the reduced set of PCs
    for (i in length(PCA$sdev):1){
    #    if (is.null(dim(newPCA))){
    #      clusters[[clusterID+1]] <- 
    #      break
    #    } # breaks when all but one variable are used up
        clu <- names(which(abs(newPCA[,i])>threshold)) # find variables with high loading
        if (length(clu)>0){ # 
          clusters[[clusterID]] <- clu # store results
          clusterID <-  clusterID + 1  # count one up if clusters were found
        } else next # jump to next value of i
        newPCA <- newPCA[-(which(rownames(newPCA) %in% unique(unlist(clusters)))), , drop=FALSE]  
    }
    clusters[[length(clusters)+1]] <- rownames(newPCA)
    clusters
}
# example:
#backwardPCAclustering(cdata[,-1])


PCAclusteringforward <- function(cdata, threshold=0.32){
    # function to get cluster grouping through PCA
    # threshold specifies the correlation at which variables loading on a PC should be regarded as belonging to the same cluster
    # goes from first to last PC
    # returns a list of variable names; each entry is a cluster
    #
    # method as communicated by PO and PL, implemented by CFD, Juni 2008
    
    PCA <- prcomp(cdata, scale=T)
    clusters <- list() # the list of clusters
    clusterID <- 1 # the counter for the clusters
    newPCA <- PCA$rotation # to store the reduced set of PCs
    for (i in 1:length(PCA$sdev)){
    #    if (is.null(dim(newPCA))){
    #      clusters[[clusterID+1]] <- 
    #      break
    #    } # breaks when all but one variable are used up
        clu <- names(which(abs(newPCA[,i])>threshold)) # find variables with high loading
        if (length(clu)>0){ # 
          clusters[[clusterID]] <- clu # store results
          clusterID <-  clusterID + 1  # count one up if clusters were found
        } else next # jump to next value of i
        newPCA <- newPCA[-(which(rownames(newPCA) %in% unique(unlist(clusters)))), , drop=FALSE]  
    }
    clusters[[length(clusters)+1]] <- rownames(newPCA)
    clusters
}
# example:
#forwardPCAclustering(cdata[,-1], threshold=0.32)


pplsoptim <- function(parms=c(12, 10), train.data){
       # helper function for the optimsation procedure on PPLS
          nk = round(parms[1])
          lambda = round(parms[2])
          fm.ppls <- train.ppls(train.data, nk=nk, lambda=lambda)         
          # this step in the analysis reduces the number of components:
          fit <- NS <- 1:ncol(fm.ppls$ppls$coefficients)
          for (i in NS){
              pred.train.ppls <- fm.ppls$transf.data %*% fm.ppls$ppls$coefficients[,i]  # calculates predicted values 
              fit[i] <- cor(pred.train.ppls, train.data[,1])^2 # calculates R2 for train data
          }
          n.comp <- which.min(AICc.r2(fit, 100, NS)) #calculates AIC for all fits and finds most parsimonious model
          fm.ppls$comps <- n.comp # derived number of components is stored
            
          predicted.ppls <- pred.ppls(fm.ppls, newdata=train.data)
          cor(predicted.ppls, train.data[,1])^2
}


pred.cpca  <-  function(object, newdata){
  # prediction function for Constrained Principal Component Analysis
  # author: Gudrun Carl
   Xtest <- as.matrix(newdata[, -1, drop=FALSE])
   Ytest <- newdata[, 1]
#  Xtest <- scale(Xtest) # NEE! Mit den train-Daten skalieren!
#  Ytest <- scale(Ytest) # NEE! Mit den train-Daten skalieren!
# CFD:
  mean.mat <- matrix(attr(object$Xtrain, "scaled:center"), ncol=ncol(object$Xtrain), nrow=nrow(Xtest), byrow=TRUE)
  sd.mat <- matrix(attr(object$Xtrain, "scaled:scale"), ncol=ncol(object$Xtrain), nrow=nrow(Xtest), byrow=TRUE)                                     
  Xtest <- (Xtest-mean.mat)/sd.mat #standardised by train data!
  Ytest <- (Ytest - object$ymean)/object$ynorm

  b.train <- object$coef
  comps <- object$comps
  ymean <- object$ymean
  ynorm <- object$ynorm
  predcpca <- Xtest %*% b.train
  predcpca <- predcpca*ynorm + ymean
  predcpca
}


pred.lrr  <-  function(object, newdata){
  # prediction function for latent root regression
  # author: Gudrun Carl
  Xtest <- as.matrix(newdata[, -1])
  Ytest <- as.matrix(newdata[, 1])
# CFD:
#  Xtest <- scale(Xtest)  # NEE!
  mean.mat <- matrix(attr(object$Xtrain, "scaled:center"), nrow=nrow(Xtest), ncol=ncol(Xtest), byrow=TRUE)
  sd.mat <- matrix(attr(object$Xtrain, "scaled:scale"), nrow=nrow(Xtest), ncol=ncol(Xtest), byrow=TRUE)
  Xtest <- (Xtest-mean.mat)/sd.mat #standardised by train data!

  X <- Xtest/sqrt(sum(diag(t(Xtest)%*%Xtest)))  #sum(diag(.)) instead of matrix.trace? faster!

  b.train <- object$coef
  comps <- object$comps
  LOAD <- object$LOAD
  ymean <- object$ymean
  ynorm <- object$ynorm
  # inserted by Carsten to use only selected components:
  selected.components <- as.numeric(substring(names(object$coef), first=6))
  predlrr <- (X %*% LOAD)[, selected.components] %*% b.train

  predlrr <- predlrr*ynorm + ymean #de-standardise
  as.numeric(predlrr)
}


pred.pcr  <-  function(object, newdata){
  # principal components regression (pcr)
    # prediction function for train.pcr
    # author: Gudrun Carl
    beta <- object$coef
    family <- object$family
    k <- length(beta)-1
    
    #create formula with intercept
    ding  <-  paste("X",  1:k,  sep="")
    form <- as.formula(paste("y~", paste(ding, collapse="+")))
    
    Xtest <- model.matrix(form, as.data.frame(newdata))
    lin <- Xtest%*%beta
    if(family=="gaussian") pred <- lin
    if(family=="binomial") pred <- exp(lin)/(1+exp(lin))
    if(family=="poisson")  pred <- exp(lin)
    pred
}


pred.pls  <-  function(object, newdata){
  # partial least squares (PLS) wrapper
    # prediction function for train.pls
    # author: Gudrun Carl

  family <- object$family
  comps <- object$comps
  Xtest <- as.matrix(newdata[, -1])
  Ytest <- newdata[, 1]
  if (family=="gaussian"){
    require(ppls)
    predpls <- new.penalized.pls(object$pls, Xtest)$ypred
    predpls <- predpls[, comps]
  }
  if (family=="binomial"){
    require(gpls)
    predpls <- predict(object$pls, Xtest)
    predpls <- predpls$pred
  }
  predpls
}


pred.ppls  <-  function(object, newdata){
  # penalised partial least squares (PPLS) wrapper
  # prediction function for train.ppls
  # author: Gudrun Carl
  require(ppls)
  Xtest <- as.matrix(newdata[, -1])
# CFD:
#  Xtest <- scale(Xtest)  # NEE!
  mean.mat <- matrix(attr(object$Xtrain, "scaled:center"), nrow=nrow(Xtest), ncol=ncol(Xtest), byrow=TRUE)
  sd.mat <- matrix(attr(object$Xtrain, "scaled:scale"), nrow=nrow(Xtest), ncol=ncol(Xtest), byrow=TRUE)
  Xtest <- (Xtest-mean.mat)/sd.mat #standardised by train data!

  Ytest <- newdata[, 1]
  Xtrain <- object$Xtrain
  comps <- object$comps
  nknot <- object$nknot
  dummy <- X2s(Xtrain, Xtest, deg=3, nknot=nknot) 
  # transformed X data
  # Z <- dummy$Z 
  # transformed Xtest data
  Ztest <- dummy$Ztest 
  # prediction for test data
  predppls <- new.penalized.pls(object$ppls, Ztest)$ypred
  predppls[, comps]
}


predict.lasso <- function(object, newdata){
      # simple prediction function for LASSO and ridge
      # CFD, 27 Aug 2008
      if (colnames(newdata)[1]=="y") newdata=newdata[,-1]
      COEFS <- coefficients(object)
      newdata.lasso <- matrix(ncol=length(COEFS), nrow=nrow(newdata))      
      colnames(newdata.lasso) <- names(COEFS)
      ints <- grep(":", names(COEFS), fixed=TRUE)
      quads <- grep("I(", names(COEFS), fixed=TRUE)
      main <- which(!(1:length(COEFS) %in% c(ints, quads)))[-1]
      if (length(main)==0) return(rep(COEFS[1], times=nrow(newdata)))
      newdata.lasso[, 2:(length(main)+1)] <- as.matrix(newdata[, (main-1)])
      attach(newdata)
      if (length(ints)>0) {
        for (i in ints){
            a <- get(strsplit(names(COEFS[i]),":")[[1]][1])
            b <- get(strsplit(names(COEFS[i]),":")[[1]][2])  
            newdata.lasso[,i] <- a*b          
        }
      }
      if (length(quads)>0){
        for (j in quads){#
            Q <- get(strsplit(substring(names(COEFS[j]), 3), "^", fixed=TRUE)[[1]][1])
            newdata.lasso[,j] <- Q*Q
        }
      }
      detach(newdata)
      newdata.lasso[,1] <- 1 # intercept

      #newdata.lasso <- cbind("(Intercept)"=1, newdata[, names(coefficients(object))[-1]])
      as.numeric(as.matrix(newdata.lasso) %*% coefficients(object))
}


predict.seqreg <- function(object, newdata, family=gaussian){ #, type="new"
    # calculates transformed data for new data based on seqreg object
    #
    # in "new", the sequence of variable importances is taken from the training data,
    # but the sequential regressions are carried out anew on the new data; the logic
    # here is that we interpret the contribution of a variable AFTER accounting for the
    # effect of those earlier in the sequence, but the data are themselves more or less
    # uncorrelated
    # in "old", the coefficients of the sequential regression of the training data is used
    # to predict onto the new data; the interpretation is not clear to me
    #
    # last change: sven.lautenbach@ufz.de introduced the family argument to predict.seqreg
    # in line 963 and 981
    #
    # which variables are still in the model?:#
          
#    if (type=="new"){
        newdata.transf <- seqreg(response=newdata[,1], X=newdata[,-1], 
          sequence=match(names(object$model$data[-1]), colnames(newdata[,-1])), family=family)[[1]]
#    }
#        
#    if (type=="old"){
#        newdata.sort <- newdata[names(object[[1]])]
#        coefs <- object[[2]]
#        newdata.transf <- newdata.sort
#        for (i in 1:length(coefs)){
#            newdata.transf[, i+1] <- as.numeric(newdata.sort[, i+1] - cbind(1, as.matrix(newdata.sort[,1:i])) %*% coefs[[i]])
#        }
#    }
#            
    cbind.data.frame(y=newdata[,1], newdata.transf)
}


seqreg <- function(response, X, family="gaussian", univar="gam", sequence=NULL){
    # returns a table with residuals of cumulative regression; sequence of variables is
    #       determined by their marginal importance (or by "order", if given)
    #
    # response is the response variable
    # data is a matrix or data.frame of explanatory variables
    # family specifies the error distribution of the response
    # sequence can be a vector of length ncol(data) representing the sequence in which
    #       variables will be arranged for residual regression
    #
    # authors: Jaime Garcia and Carsten F. Dormann, 10.10.2006
    # changed by  Tamara Mnkemller and Damaris Zurell, 12.12.2007
    # last changed by Damaris Zurell, 14.03.2008
    # last changed by Carsten F. Dormann, 13.6.2008
		# last changed by Sven Lautenbach, 04.12.2009 -> switch for negativ.binomial vs negbin
    #'''''''''''''''''''''''''''''''''''''''''''''''''''''''
    require(mgcv)
    # consider special case of negative binomial function
    # gam comes with a special implementation of negative.binomial
    # called negbin
    if( class(family) == "family")
    {
	    if ((substring(family$family, 1, 3) == "Neg" & univar=="gam"))
			{
				# extract theta parameter
				tmp1 <- strsplit(family$family, "(", fixed=TRUE)[[1]][2]
				theta <- strsplit(tmp1, ")", fixed=TRUE)[[1]][1]
				family=paste("negbin(", theta, ")", sep="")

			}
		}
    # helper functions:

    var.imp <- function (variable, response, univar=univar, family="gaussian")
    {   # calculates the univariate (=marginal) importance of a variable for  response, using either glm or (shrinkage) GAM
          if (univar=="glm"){
            fm.glm <- glm(response ~ variable, family=family)
            summary(fm.glm)$aic
          } else {
            if (univar=="gam"){
              fm.gam <- gam(response ~ s(variable, bs="cs", k=5), family=family)
            	AIC(fm.gam)
            } else return(F)
          }
    }

    cumResid <- function(X)
    {     # returns a table with residuals of cumulative regression
          # requires a SORTED table of explanatory variables
          # data MUST BE continuous variables!
          seqreg.rs <- X                   # repeat new.exdata to delete cols 2:5
          coefs <- list()

          for (i in 1:(ncol(X)-1))
          {
            righthandside <- paste(colnames(X)[1:i], collapse=" + ", sep=" ")  # extract the number of the predictor variables
            resp <- X[,i+1]
            form <- formula(paste("resp ~ ", righthandside, sep=""))
            fm.glm <- lm(form, data=seqreg.rs)
            seqreg.rs[,i+1] <- residuals(fm.glm)
            coefs[[i]] <- coefficients(fm.glm)
          }
          return(list(seqreg.rs, coefs=coefs))
    }

    #'''''''''''''''''''''''''''''''''''''''''''''''''''''''
    a <- try(var.imp(X[,1], response, univar=univar))
    if (is.numeric(a)!=1) {stop("invalid univar method")}
    if (is.null(sequence)) {
        sort.varib <- names(sort(apply(X, 2, var.imp, response=response, family=family,univar=univar), decreasing=F))
        } else { sort.varib  <- colnames(X)[sequence]}

    new.data <- X[, sort.varib]      # New data frame with values and with new order

    cumResid(X=as.data.frame(new.data))    
}


select07 <- function(X, y, family="binomial",univar="gam", threshold=0.7, method="pearson", sequence=NULL, ...){
    # selects variables based on removing correlations > 0.7, retaining those
    # variables more important with respect to y
    # when a sequence is given, this will be used instead (Damaris)

    # 1. step: cor-matrix
    # 2. step: importance vector
    # 3. step: identify correlated pairs
    # 4. step: remove less important from pairs
    #get rid of it II: seqreg, select07,maxspan
    # written by Carsten F. Dormann;
    # last changed by  Tamara Mnkemller and Damaris Zurell, 12.12.2007
    # last changed by Damaris Zurell, 19.12.2008
    # last changed by Sven Lautenbach, 04.12.2009 -> switch for negativ.binomial vs negbin
    require(mgcv)

    var.imp <- function (variable, response, univar=univar, family="gaussian"){
        # calculates the univariate (=marginal) importance of a variable for a response
        if (!univar %in% c("glm", "gam")) stop("Invalid univariate screening method: choose 'glm' or 'gam' (default).")
        
        if (univar=="glm"){
          fm.glm <- glm(response ~ variable, family=family)
          summary(fm.glm)$aic
        } else {
          # consider special case of negative binomial function
			    # gam comes with a special implementation of negative.binomial
			    # called negbin
			    if( class(family) == "family")
    			{
				    if ((substring(family$family, 1, 3) == "Neg" & univar=="gam"))
						{
							# extract theta parameter
							tmp1 <- strsplit(family$family, "(", fixed=TRUE)[[1]][2]
							theta <- strsplit(tmp1, ")", fixed=TRUE)[[1]][1]
							family=paste("negbin(", theta, ")", sep="")
						}
					}
          fm.gam <- gam(response ~ s(variable, ...), family=family)
          AIC(fm.gam)
        }
    } 
	
    cm <- cor(X, method=method)
    pairs <- which(abs(cm)>= threshold, arr.ind=T) # identifies correlated variable pairs
    index <- which(pairs[,1]==pairs[,2])           # removes entry on diagonal
    pairs <- pairs[-index,]                        # -"-
    exclude <- NULL
    if (NROW(pairs)!=0)
    {
        if (is.null(sequence)) {
          #importance as AIC: the lower the better!
          imp <- apply(X, 2, var.imp, response=y, family=family, univar=univar)           
          for (i in 1:NROW(pairs))
          {
              a <- imp[rownames(cm)[pairs[i,1]]]
              b <- imp[rownames(cm)[pairs[i,2]]]
              exclude <- c(exclude, ifelse(a>b, names(a), names(b)))
          }
        } else { 
          for (i in 1:NROW(pairs))
          {
              a <- which(pairs[i,1]==sequence)
              b <- which(pairs[i,2]==sequence)
              exclude <- c(exclude, ifelse(a>b, rownames(cm)[pairs[i,1]], rownames(cm)[pairs[i,2]]))
          }
        }
    }
    X <- X[,!(colnames(X) %in% unique(exclude)),drop=F]
    return(X)
}
#select07(X=LE, y=LanExc12[,"c"], family="binomial", threshold=0.7, method="spearman")[1:10,]



step.seqreg <- function(data, y.col=1, k=2, quadratic=TRUE, interactions=TRUE, family="gaussian", ...){
    # in seqreg, the data need to be re-calculated every time a deletion took place;
    # this requires a very different type of coding the stepwise selection process
    #
    # object is a seqreg-object on the full dataset
    # data   is the data.frame used in the initial seqreg
    # y.col  is the column-number of the response
    # by CFD, Aug. 2008
    
    newdata=data
    
    # step 1: run a seqreg
    step1 <- function(...){
      seqreg(...)[[1]]
    }
    
    # step 2: run a glm on seqreg.data1
    step2 <- function(exclude, data, ...){   
      f <- formula.maker(data, y.col=1, quadratic=quadratic, interactions=interactions)
      if (!is.null(exclude)) {
          f <- update(f, paste(" ~ . ", paste(exclude, collapse=" - "), sep=" - "))
          #f <- as.formula(paste(paste(as.character(f)[c(2,1,3)], collapse=""), "-", paste(exclude, collapse="-"), sep=""))
      }
       glm(formula=f, data=data, ...)
    }   
    
    #initial d-values to get into the while-loop:
    d <- list()
    d$AIC <- c(1,0)
         
    while (d$AIC[1] >= min(d$AIC[-1])){ # as long is null model is not best
      #step1
      seqreg.data1 <- step1(response=newdata[,y.col], X=newdata[,-y.col, drop=TRUE], family=family, ...) 
      mainremoved=FALSE
      exclude <- c()
      while (!mainremoved | d$AIC[1] >= min(d$AIC[-1])){     
        # step2
        reduced.data <- cbind(y=newdata[,y.col, drop=FALSE], seqreg.data1)
        fm <- step2(exclude, data=reduced.data, family=family, ...)
        # step3: find least important variable in the model
        d <- drop1(fm, k=k)
        # step 4: was dropped a main effect also part of an interaction?
        candidates <- rownames(d)[-1][order(d$AIC[-1], decreasing=FALSE)]
        if (length(detect(candidates[1], ":"))==0 & length(detect(candidates[1], "^2"))==0){ mainremoved=TRUE} # still part of an interaction!
        if (candidates[1]=="<none>") break
        exclude <- c(exclude, candidates[1]) 
      }
      throughout <- which(colnames(newdata)==candidates[1])
      if (!(length(throughout)==0) & d$AIC[1] >= min(d$AIC[-1])) newdata <- newdata[,-throughout] 
      if (ncol(newdata)==2) break   
    }
    list(model=fm, data=newdata)
}


train.cpca  <-  function(data, comps=NULL, alpha=0.1){
  # runs the Constrained Principal Component Analysis
  # 0 <= alpha <= 1
  # if alpha=0 then cpca is equivalent to pcr
  # if alpha=1 then cpca is equivalent to ols
  # written by Gudrun Carl, 2008
  
  Xtrain <- as.matrix(data[, -1])
  Ytrain <- as.matrix(data[, 1])
  k <- ncol(Xtrain)
  n <- nrow(Xtrain)
#  ymean <- mean(Ytrain)
#  ynorm <- sd(Ytrain) #as.numeric(sqrt(t(Ytrain-ymean)%*%(Ytrain-ymean)/(n-1)))
  
  Xtrain <- scale(Xtrain)
  Ytrain <- scale(Ytrain)
  
  if (alpha!=0 & alpha!=1) {
    al1 <- (1-alpha)*Xtrain    # multiply all X-values by 1-alpha
    al2 <- alpha*Ytrain        # multiply all y-values by alpha
    A <- cbind(al1, al2)       # make new matrix with X and y (in this sequence!)
    A <- as.matrix(A)
  }
  
  if (alpha==0) A <- Xtrain
  if (alpha==1) A <- Ytrain
  
  V.X <- t(Xtrain) %*% Xtrain
  V.XA <- t(Xtrain) %*% A
  V.AX <- t(A) %*% Xtrain
  s <- svd(V.X)
  d <- diag(1/sqrt(s$d))
  V.Xm12 <- s$u %*% d %*% t(s$u)
  U <- eigen(V.Xm12 %*% V.XA %*% V.AX %*% V.Xm12)$vector
  if (is.null(comps)){
  #  U <- U[, 1:comps]       # outcommented: CFD
    # added by CFD:
    b.mat <- matrix(nrow=ncol(Xtrain), ncol=ncol(Xtrain))
    for (i in 1:ncol(U)){ # CFDs version
    # as in PPLS, all components are successively used, and AICc is later used to determine the "best" number of components
        U2 <- U[, 1:i]
        b.train <- V.Xm12 %*% U2 %*% t(U2) %*% t(V.Xm12) %*% t(Xtrain) %*% Ytrain  
        b.mat[,i] <- b.train
    }   
   }
   if (!is.null(comps)){ # GC original
      U <- U[, 1:comps]
      b.train <- V.Xm12 %*% U %*% t(U) %*% t(V.Xm12) %*% t(Xtrain) %*% Ytrain
      b.mat <- b.train
   }
  
  fit <- list(coef=b.mat, alpha=alpha, comps=comps, ymean=attr(Ytrain, "scaled:center"), ynorm=attr(Ytrain, "scaled:scale"), Xtrain=Xtrain)
  fit
}


train.lrr  <-  function(data, comps=(ncol(data)-1)){
  # latent root regression
  # data is a data.frame with the response in the first column
  # comps is an optional number of latent variables to be extracted; defaults to as many as there are variables and is "re-adjusted" in the output
  # written by Gudrun Carl, 2008
#  require(matrixcalc) # only for matrix.trace!

  Xtrain <- as.matrix(data[, -1])
  Ytrain <- as.matrix(data[, 1])
  k <- ncol(Xtrain)
  n <- nrow(Xtrain)
  ymean <- mean(Ytrain) 
  ynorm <- as.numeric(sqrt(t(Ytrain-ymean)%*%(Ytrain-ymean))) # what's that? Why not use sd?
  Xtrain <- scale(Xtrain)
  Ytrain <- as.vector(scale(Ytrain))
  X <- Xtrain/sqrt(sum(diag(t(Xtrain)%*%Xtrain)))         #matrix.trace
  y <- Ytrain/sqrt(t(Ytrain)%*%Ytrain)
  Xold <- X
  yold <- y
  V <- t(X)%*%X
  
  LT <- matrix(0, n, k)
  LP <- matrix(0, k, k)
  for( it in 1:k){
    l <- eigen(V)$value
    P <- eigen(V)$vector
    Z <- X%*%P
    A <- cbind(y, Z)
    lambda <- eigen(t(A)%*%A)$value
    gamma <- eigen(t(A)%*%A)$vector
    #dim(Z)
    gamma.null <- gamma[1, ]/gamma[1, 1]
    gamma.null <- gamma.null[-1]
    LT[, it] <- Z%*%gamma.null
    LT[, it] <- LT[, it]/sqrt(sum(LT[, it]^2))
  
    lp <- t(t(LT)%*%X)   # LT = score,   LP= loadings
    LP[, it] <- lp[, it]
    my <- lm(y~LT[, it]-1)
    resy <- resid(my)
    resx <- matrix(NA, n, k)
    for ( i in 1:k){
      x <- X[, i]
      mx <- lm(x~LT[, it]-1)
      resx[, i] <- resid(mx)
    }
    resxsc <- scale(resx)
    resysc <- as.vector(scale(resy))
    X <- resxsc
    y <- resysc
    X <- resxsc/sqrt(sum(diag(t(resxsc)%*%resxsc)))      #matrix.trace
    y <- resysc/sqrt(t(resysc)%*%resysc)
  
    V <- t(X)%*%X
  }
  
  X <- Xold
  y <- yold
  
  LOAD <- LP[, 1:comps]%*%solve(t(LP[, 1:comps])%*%LP[, 1:comps])
  XLOAD <- X%*%LOAD
  dimn  <-  paste("XLOAD",  1:comps,  sep="")
  colnames(XLOAD) <- dimn
  #create formula without intercept
  form <- as.formula(paste("y~", paste(dimn, collapse="+"), "-1"))
  
  dataload <- data.frame(y, XLOAD)
  mload <- step(lm(formula=form, dataload), k=2, trace=FALSE) # Carsten put the step in here
  b <- coef(mload)
  #selected.components <- which(colnames(XLOAD) %in% names(b)) #only store components that were selected during the stepwise regression
  
  #Carsten added:
  comps <- length(b) # the idea here is that the step selects how many latent variables are required
  
  fit <- list(coef=b, LOAD=LOAD, LT=LT, LP=LP, formula=mload$call$formula, comps=comps, ymean=ymean, ynorm=ynorm, Xtrain=Xtrain)
  fit
}


train.pcr   <-   function(data, family, comps){
  # principal components regression (pcr)
    # runs the PCR
    # author: Gudrun Carl
    Xtrain <- as.matrix(data[, -1])
    Ytrain <- as.matrix(data[, 1])
    
    a  <-  eigen(t(Xtrain)%*%Xtrain)$vector
    aa <- a[, 1:comps]
    
    Ftrain <- Xtrain%*%aa
    Fdata <- data.frame(Ytrain, Ftrain)
    i  <-  1:comps
    namvar  <-  paste("Ftrain", i, sep="")
    dimnames(Fdata)[[2]]  <-  c("Ytrain", namvar)
    formula  <-  as.formula(paste("Ytrain~", paste(namvar
     , collapse="+")))
    
    # fit for glm 
    mglm  <-  glm(formula, family, Fdata)
    btrain  <-  c(mglm$coef[1], aa%*%mglm$coef[-1])
    
    fit  <-  list(glm=mglm, coef=btrain, formula=formula, family=family)
    fit
}


train.pls <- function(data, family="gaussian", ...){
  # performs a partial least square regression; by choosing ncomp (for gaussian data) or K.prov (for other families), one can specify the number of latent variables to be forced
    # runs the PLS
    # author: Gudrun Carl
  Xtrain <- as.matrix(data[, -1])
  Ytrain <- data[, 1]
  if (family=="gaussian"){
    require(ppls)
    # fit for ppls (without penalized)
    mpls <- penalized.pls(Xtrain, Ytrain, ...) 
  }
  if(family=="binomial"){
    require(gpls)
    # fit for gpls 
    mpls <- gpls(Xtrain, Ytrain, ...) 
  }
  fit <- list(pls=mpls, family=family)
  if (exists("comp")) fit$comp <- comp
  fit
}


train.ppls  <-  function(data, comps=(ncol(data)-1), nk=12, lambda=50){
  # wrapper to run the PPLS
  # author: Gudrun Carl
  require(ppls)
  Xtrain <- as.matrix(data[, -1])
  Xtrain <- scale(Xtrain)
  Ytrain <- data[, 1]
  nknot <- rep(nk, ncol(Xtrain))
  # transformation of the data
  dummy <- X2s(Xtrain, deg=3, nknot=nknot) # spline-transforms of the original data
  # transformed X data
  Z <- dummy$Z 
  # transformed Xtest data
  # Ztest <- dummy$Ztest 
  # size of the transformed data                                    
  size <- dummy$sizeZ
  # Penalty matrix
  P <- Penalty.matrix(size, order=2)  
  # fit for ppls (with penalized) 
  mppls <- penalized.pls(Z, Ytrain, P=lambda*P, ncomp=comps)

  fit <- list(ppls=mppls, transf.data=Z, Xtrain=Xtrain, comps=comps, nknot=nknot) # CFD: Z hinzugefgt
  fit
}


varcluster <- function(X, similarity="hoeffding", hmethod='ward', threshold=0.1, varselect='centered', response=NULL, plot.clustertree=FALSE)
{
	# This is a modification of varclus.select to return a list of clusters (CFD)
  # varclus.select first employs varclus to build a cluster tree, then uses the user-defined threshold to select the clusters, and finally the varselect method to choose a variable to represent the cluster.
	#
	#similarity="hoeffding" (Default)
	#hmethod='ward' (Default)
	# varselect = c("centered", "explained", "summarised"): Default="centered"
	# threshold: set level of acceptable collinearity; NOTE: This has different meanings in different similarities!! A value of 0.7 with Spearman means actually a rho^2 of 0.7 (i.e. a correlation of 0.84). 
  #       Suggestions for thresholds: "hoeffding": 0.1     (DEFAULT)
  #                                   "spearman" and "pearson":  0.49
  #
  # Core code by CFD, streamlining of the threshold-issues by BL and GC.
  # Note: I re-modified it to use original varclus {Hmisc}, not varclus2!
  #       Hence, only this function is needed to identify clusters.
  require(Hmisc)
  
	  N <- ncol(X)
    excluded <- NULL # to take the excluded variables
    cr <- matrix(nrow=0, ncol=2); colnames(cr)<- c("cluster", "var.name") #clusterrepresentative
    
    v <- varclus(as.matrix(X), similarity=similarity, method=hmethod)
    #plot(v)
    
    h <- v$hclust$merge # sequence in cluster diagram
    s <- v$sim          # correlation^2-matrix
    vec <- rep(0,nrow(h)) # to hold the correlations between two any two variables next to each other in the cluster diagram
    vecs <- list(c())  # a list of variables forming a cluster, in the sequence from highest to lowest similarity (i.e. as many as there are splits)
    
    for (i in 1:nrow(h)){ 
    # builds a sequence of variables highly correlated (at decreasing threshold);
    # the threshold for each level is put into "vec", while the sequence of 
    # correlated variables is put into "vecs"
    # Thus, if we choose a threshold value of k then only the list entries up to 
    # the entry vec>k has to be put into clusters.
    	if (all(h[i,]<0)){ 
      	 vecs[[i]] <- c(abs(h[i,]))	 
      	 vec[i] <- s[abs(h[i,1]), abs(h[i,2])]
      }
    	if (sum(h[i,]>0)==1){	 
    	   vec[i] <- min(s[abs(min(h[i,])), vecs[[max(h[i,])]]])
       	 vecs[[i]] <- c(vecs[[max(h[i,])]], abs(min(h[i,])))
      }
    	if (all(h[i,]>0)){		 
    	  vec[i] <- min(s[vecs[[h[i,1]]], vecs[[h[i,2]]]])
    	  vecs[[i]] <- c(vecs[[max(h[i,])]], vecs[[min(h[i,])]])
   	  }
    }
	 
	 clusters <- list(c())
	 
   # CFD added:
   if (sum(vec>threshold)==0) {return(names(X)); break}
   # CFD end
 	 for (i in 1:sum((vec>threshold)==T)){ # loops through all clusters with similarities above a pre-defined collinearity threshold (those below are uncorrelated)
	 # here the scan for the clusters starts:
	  	w <- vecs[[i]]
 	    for (j in 1:sum((vec>threshold)==T)){
       # check, whether there are more variables in the proposed clusters (vecs) than in the presently proposed (w)  AND check, whether those more variables are already contained in the proposed; if so, 
	  	 	if (length(vecs[[j]]) > length(w)) # AND
#  	 		if ((sum(w==vecs[[j]][1:length(w)])) == length(w))   #THEN
# here's a problem! If the sequence is not identical, this if-statement will not correctly identify the variables already present in the proposed cluster!
# here's a solution: use %in% instead:  	 		
  	 		if ((sum(w %in% vecs[[j]])) == length(w))   #THEN  	 		 	 		
				w <- vecs[[j]]
	    }
	    clusters[[i]] <- w 
   }
   clusters <- unique(clusters) # eliminates double entries
   out <- clusters
   for (i in 1:length(clusters)){
        # replace numbers by names
        out[[i]] <- colnames(X)[clusters[[i]]]
   }
	  # now we need to add all variables not included in the clusters:
	  uncor.vars <- as.list(colnames(X)[-unlist(clusters)])
	  if (length(uncor.vars)!=0){ out <- c(out, uncor.vars) }
  
    if (plot.clustertree){plot(v); abline((1-threshold),0,col='blue') }
	  
	  return(out) 
}