## Bastle Zutaten: observations <- c("1", "2", "3", "4", "5", "6") states <- c("Fair", "unfair") Fairprobs <- c(0.8, 0.2) Unfairprobs <- c(0.1, 0.9) transitionmatrix <- matrix(c(Fairprobs, Unfairprobs), 2,2,byrow=T) rownames(transitionmatrix) <- states colnames(transitionmatrix) <- states transitionmatrix Fairstate <- c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6) Unfairstate <- c(0.1, 0.1, 0.1, 0.1, 0.1, 0.5) emissionmatrix <- matrix(c(Fairstate, Unfairstate), 2,6,byrow=T) rownames(emissionmatrix) <- states colnames(emissionmatrix) <- observations emissionmatrix hmmseq <- function(states, # Vektor der möglichen Zustände transitionmatrix, # Übergangsmatrix der Zustände emissionmatrix, # Emissionsmatrix inprobs, # Ausgangswahrscheinlichkeiten seqlength # Länge der generierten Sequenz(en) ){ observations <- observations states <- states myseq <- character() mystates <- character() # Auswählen des ersten Zustandes: firststate <- sample(states, 1, rep=T, prob=inprobs) probs <- emissionmatrix[firststate,] # Auswählen des ersten observations gegeben dem ersten State firstnuc <- sample(observations, 1, rep=T, prob=probs) myseq[1] <- firstnuc mystates[1] <- firststate for(i in 2:seqlength){ laststate <- mystates[i-1] stateprobs <- transitionmatrix[laststate,] state <- sample(states, 1, rep=T, prob=stateprobs) probs <- emissionmatrix[state,] nuc <- sample(observations, 1, rep=T, prob=probs) myseq[i] <- nuc mystates[i] <- state } return(data.frame(myseq, mystates)) } initialprobs <- c(0.5, 0.5) T <- 100 set.seed(42) seq <- hmmseq(states, transitionmatrix, emissionmatrix, initialprobs, T) seq nucsequence <- as.vector(seq[,1]) statesequence <- as.vector(seq[,2]) ## Viterbi #(cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf, S.209) viterbi <- function(sequence, transitionmatrix, emissionmatrix){ # Zustandsnamen: states <- rownames(emissionmatrix) #Viterbimatrix: v <- viterbimatrix(sequence, transitionmatrix, emissionmatrix) probablestatepath <- apply(v, 1, function(x) which.max(x)) # Ausgabe: lastnuc <- sequence[1] lastprobablestate <- probablestatepath[[1]] lastprobablestatename <- states[lastprobablestate] startpos <- 1 for(i in 2:length(sequence)){ nuc <- sequence[i] probablestate <- probablestatepath[[i]] probablestatename <- states[probablestate] if(probablestatename != lastprobablestatename){ print(paste("Position", startpos,"-", (i-1), "Wahrscheinlichster Zustand: ", lastprobablestatename)) startpos <- i } lastnuc <- nuc lastprobablestatename <- probablestatename } print(paste("Position", startpos, "-", i, "Wahrscheinlichster Zustand: ", lastprobablestatename)) } viterbimatrix <- function(sequence, transitionmatrix, emissionmatrix){ sequence <- sequence numstates <- dim(transitionmatrix)[1] v <- matrix(NA, nrow=length(sequence), ncol=dim(transitionmatrix)[1]) v[1,] <- 0 v[1,1] <- 1 for(i in 2:length(sequence)){ for(j in 1:numstates){ statejprobobservationi <- emissionmatrix[j, sequence[i]] v[i,j] <- statejprobobservationi * max(v[(i-1),]*transitionmatrix[,j]) } } return(v) } viterbi(nucsequence, transitionmatrix, emissionmatrix) ## Forward-Variable # Inputs: # - y: Beobachtungsvektor # - pi: "initial distribution" # - Q: Übergangsmatrix # - g: Emissionsmatrix # # Output: # phi := P(X_t = x |y_{1:t}) # c(t) := P(Y_t = y_t | Y_{1:t-1}=y_{1:t-1}) Forward <- function(y, pi, Q, g){ N <- length(y) dims <- dim(Q) result <- list() result$phi <- matrix(nrow=dims[1], ncol=N) result$c <- array(0,N) alpha <- pi*g[,y[1]] result$c[1] <- sum(alpha) result$phi[,1] <- alpha/result$c[1] for (t in 2:N){ H <- (result$phi[,t-1]%*%Q)*g[,y[t]] result$c[t] <- sum(H) result$phi[,t] <- H/result$c[t] } result } ## Backward-Variable # Inputs: # - y: Beobachtungsvektor # - Q: Übergangsmatrix # - g: Emissionsmatrix # - c: bedingte Likelihoods # # Outputs: # beta = P(y_{t+1:n} | X_t = x) / P(y_{t+1:n} | y_{1:t} Backward <- function(y, Q, g, c){ N <- length(y) dims <- dim(Q) beta <- matrix(1, nrow=dims[1], ncol=N) for(t in seq(N-1,1,-1)){ beta[,t]=Q%*%(g[,y[t+1]]*beta[,t+1])/c[t+1] } beta } ## Baum-Welch Algorithmus # Inputs: # - y: Beobachtungsvektor # - pi: Ausgangswahrscheinlichkeiten # - tol: Stoppkriterium # - maxiter: Erlaubte Anzahl an Iterationen # # Outputs: # Q: Schätzung für die Übergangsmatrix # g: Schätzung für die Emissionsmatrix # ll: log-likelihood von y für Q und g Baumwelch <- function(y, pi, tol=0.00001, maxiter = 1000){ N <- length(y) k <- length(pi) M <- max(y) Y <- matrix(0, nrow=N, ncol=M) for(t in 1:N){ Y[t, y[t]]=1 } Q <- matrix(runif(k*k), nrow=k) Q <- Q/apply(Q,1,sum) g <- matrix(runif(k*M), nrow=k) g <- g/apply(g,1,sum) iter <- 0 Q.alt <- Q-tol g.alt <- g-tol while ((sum(abs((Q.alt-Q))) + sum(abs(g.alt-g)) > tol) & (iter < maxiter)){ iter <- iter+1 forward <- Forward(y, pi, Q, g) backward <- Backward(y,Q,g,forward$c) posteriori <- forward$phi*backward L <- Q*(forward$phi[,1:(N-1)]%*%t(backward[,2:N]*g[,y[2:N]]/ (matrix(1,nrow=k,ncol=1)%*%forward$c[2:N]))) R <- posteriori%*%Y Q.alt <- Q g.alt <- g Q <- L/apply(L,1,sum) g <- R/apply(R,1,sum) } result <- list() result$Q <- Q result$g <- g result$ll <- sum(log(forward$c)) result$iter <- iter result } # Testen, nachdem Sequenz über HMMseq generiert wurde pi <- c(.5, .5) Baumwelch(as.numeric(nucsequence), pi, tol=0.000001, maxiter=10000)