Clover icon

Coverage Report

  1. Project Clover database Tue Mar 10 2026 14:58:44 GMT
  2. Package jalview.datamodel

File HiddenMarkovModel.java

 

Coverage histogram

../../img/srcFileCovDistChart0.png
0% of files have more coverage

Code metrics

38
132
34
1
672
317
58
0.44
3.88
34
1.71

Classes

Class Line # Actions
HiddenMarkovModel 20 132 58
0.00%
 

Contributing tests

No tests hitting this source file were found.

Source view

1    package jalview.datamodel;
2   
3    import jalview.io.HMMFile;
4    import jalview.schemes.ResidueProperties;
5    import jalview.util.Comparison;
6    import jalview.util.MapList;
7   
8    import java.util.ArrayList;
9    import java.util.Arrays;
10    import java.util.HashMap;
11    import java.util.List;
12    import java.util.Map;
13   
14    /**
15    * Data structure which stores a hidden Markov model
16    *
17    * @author TZVanaalten
18    *
19    */
 
20    public class HiddenMarkovModel
21    {
22    private static final char GAP_DASH = '-';
23   
24    public final static String YES = "yes";
25   
26    public final static String NO = "no";
27   
28    public static final int MATCHTOMATCH = 0;
29   
30    public static final int MATCHTOINSERT = 1;
31   
32    public static final int MATCHTODELETE = 2;
33   
34    public static final int INSERTTOMATCH = 3;
35   
36    public static final int INSERTTOINSERT = 4;
37   
38    public static final int DELETETOMATCH = 5;
39   
40    public static final int DELETETODELETE = 6;
41   
42    private static final double LOG2 = Math.log(2);
43   
44    /*
45    * properties read from HMM file header lines
46    */
47    private Map<String, String> fileProperties = new HashMap<>();
48   
49    private String fileHeader;
50   
51    /*
52    * the symbols used in this model e.g. "ACGT"
53    */
54    private String alphabet;
55   
56    /*
57    * symbol lookup index into the alphabet for 'A' to 'Z'
58    */
59    private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
60   
61    /*
62    * Nodes in the model. The begin node is at index 0, and contains
63    * average emission probabilities for each symbol.
64    */
65    private List<HMMNode> nodes = new ArrayList<>();
66   
67    /*
68    * the aligned HMM consensus sequence extracted from the HMM profile
69    */
70    private SequenceI hmmSeq;
71   
72    /*
73    * mapping from HMM nodes to residues of the hmm consensus sequence
74    */
75    private Mapping mapToHmmConsensus;
76   
77    // stores background frequencies of alignment from which this model came
78    private Map<Character, Float> backgroundFrequencies;
79   
80    /**
81    * Constructor
82    */
 
83  0 toggle public HiddenMarkovModel()
84    {
85    }
86   
87    /**
88    * Copy constructor given a new aligned sequence with which to associate the
89    * HMM profile
90    *
91    * @param hmm
92    * @param sq
93    */
 
94  0 toggle public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
95    {
96  0 super();
97  0 this.fileProperties = new HashMap<>(hmm.fileProperties);
98  0 this.alphabet = hmm.alphabet;
99  0 this.nodes = new ArrayList<>(hmm.nodes);
100  0 this.symbolIndexLookup = hmm.symbolIndexLookup;
101  0 this.fileHeader = new String(hmm.fileHeader);
102  0 this.hmmSeq = sq;
103  0 this.backgroundFrequencies = hmm.getBackgroundFrequencies();
104  0 if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo())
105    {
106    // same dataset sequence e.g. after realigning search results
107  0 this.mapToHmmConsensus = hmm.mapToHmmConsensus;
108    }
109    else
110    {
111    // different dataset sequence e.g. after loading HMM from project
112  0 this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(),
113    hmm.mapToHmmConsensus.getMap());
114    }
115    }
116   
117    /**
118    * Returns the information content at a specified column, calculated as the
119    * sum (over possible symbols) of the log ratio
120    *
121    * <pre>
122    * log(emission probability / background probability) / log(2)
123    * </pre>
124    *
125    * @param column
126    * column position (base 0)
127    * @return
128    */
 
129  0 toggle public float getInformationContent(int column)
130    {
131  0 float informationContent = 0f;
132   
133  0 for (char symbol : getSymbols().toCharArray())
134    {
135  0 float freq = ResidueProperties.backgroundFrequencies
136    .get(getAlphabetType()).get(symbol);
137  0 float prob = (float) getMatchEmissionProbability(column, symbol);
138  0 informationContent += prob * Math.log(prob / freq);
139    }
140   
141  0 informationContent = informationContent / (float) LOG2;
142   
143  0 return informationContent;
144    }
145   
146    /**
147    * Gets the file header of the .hmm file this model came from
148    *
149    * @return
150    */
 
151  0 toggle public String getFileHeader()
152    {
153  0 return fileHeader;
154    }
155   
156    /**
157    * Sets the file header of this model.
158    *
159    * @param header
160    */
 
161  0 toggle public void setFileHeader(String header)
162    {
163  0 fileHeader = header;
164    }
165   
166    /**
167    * Returns the symbols used in this hidden Markov model
168    *
169    * @return
170    */
 
171  0 toggle public String getSymbols()
172    {
173  0 return alphabet;
174    }
175   
176    /**
177    * Gets the node in the hidden Markov model at the specified position.
178    *
179    * @param nodeIndex
180    * The index of the node requested. Node 0 optionally contains the
181    * average match emission probabilities across the entire model, and
182    * always contains the insert emission probabilities and state
183    * transition probabilities for the begin node. Node 1 contains the
184    * first node in the HMM that can correspond to a column in the
185    * alignment.
186    * @return
187    */
 
188  0 toggle public HMMNode getNode(int nodeIndex)
189    {
190  0 return nodes.get(nodeIndex);
191    }
192   
193    /**
194    * Returns the name of the sequence alignment on which the HMM is based.
195    *
196    * @return
197    */
 
198  0 toggle public String getName()
199    {
200  0 return fileProperties.get(HMMFile.NAME);
201    }
202   
203    /**
204    * Answers the string value of the property (parsed from an HMM file) for the
205    * given key, or null if the property is not present
206    *
207    * @param key
208    * @return
209    */
 
210  0 toggle public String getProperty(String key)
211    {
212  0 return fileProperties.get(key);
213    }
214   
215    /**
216    * Answers true if the property with the given key is present with a value of
217    * "yes" (not case-sensitive), else false
218    *
219    * @param key
220    * @return
221    */
 
222  0 toggle public boolean getBooleanProperty(String key)
223    {
224  0 return YES.equalsIgnoreCase(fileProperties.get(key));
225    }
226   
227    /**
228    * Returns the length of the hidden Markov model. The value returned is the
229    * LENG property if specified, else the number of nodes, excluding the begin
230    * node (which should be the same thing).
231    *
232    * @return
233    */
 
234  0 toggle public int getLength()
235    {
236  0 if (fileProperties.get(HMMFile.LENGTH) == null)
237    {
238  0 return nodes.size() - 1; // not counting BEGIN node
239    }
240  0 return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
241    }
242   
243    /**
244    * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
245    * the options. Other alphabets may be added.
246    *
247    * @return
248    */
 
249  0 toggle public String getAlphabetType()
250    {
251  0 return fileProperties.get(HMMFile.ALPHABET);
252    }
253   
254    /**
255    * Sets the model alphabet to the symbols in the given string (ignoring any
256    * whitespace), and returns the number of symbols
257    *
258    * @param symbols
259    */
 
260  0 toggle public int setAlphabet(String symbols)
261    {
262  0 String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
263  0 int count = trimmed.length();
264  0 alphabet = trimmed;
265  0 symbolIndexLookup = new int['Z' - 'A' + 1];
266  0 Arrays.fill(symbolIndexLookup, -1);
267  0 int ignored = 0;
268   
269    /*
270    * save the symbols in order, and a quick lookup of symbol position
271    */
272  0 for (short i = 0; i < count; i++)
273    {
274  0 char symbol = trimmed.charAt(i);
275  0 if (symbol >= 'A' && symbol <= 'Z'
276    && symbolIndexLookup[symbol - 'A'] == -1)
277    {
278  0 symbolIndexLookup[symbol - 'A'] = i;
279    }
280    else
281    {
282  0 System.err
283    .println(
284    "Unexpected or duplicated character in HMM ALPHabet: "
285    + symbol);
286  0 ignored++;
287    }
288    }
289  0 return count - ignored;
290    }
291   
292    /**
293    * Answers the node of the model corresponding to an aligned column position
294    * (0...), or null if there is no such node
295    *
296    * @param column
297    * @return
298    */
 
299  0 toggle HMMNode getNodeForColumn(int column)
300    {
301    /*
302    * if the hmm consensus is gapped at the column,
303    * there is no corresponding node
304    */
305  0 if (Comparison.isGap(hmmSeq.getCharAt(column)))
306    {
307  0 return null;
308    }
309   
310    /*
311    * find the node (if any) that is mapped to the
312    * consensus sequence residue position at the column
313    */
314  0 int seqPos = hmmSeq.findPosition(column);
315  0 int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
316  0 if (nodeNo != null)
317    {
318  0 return getNode(nodeNo[0]);
319    }
320  0 return null;
321    }
322   
323    /**
324    * Gets the match emission probability for a given symbol at a column in the
325    * alignment.
326    *
327    * @param alignColumn
328    * The index of the alignment column, starting at index 0. Index 0
329    * usually corresponds to index 1 in the HMM.
330    * @param symbol
331    * The symbol for which the desired probability is being requested.
332    * @return
333    *
334    */
 
335  0 toggle public double getMatchEmissionProbability(int alignColumn, char symbol)
336    {
337  0 HMMNode node = getNodeForColumn(alignColumn);
338  0 int symbolIndex = getSymbolIndex(symbol);
339  0 if (node != null && symbolIndex != -1)
340    {
341  0 return node.getMatchEmission(symbolIndex);
342    }
343  0 return 0D;
344    }
345   
346    /**
347    * Gets the insert emission probability for a given symbol at a column in the
348    * alignment.
349    *
350    * @param alignColumn
351    * The index of the alignment column, starting at index 0. Index 0
352    * usually corresponds to index 1 in the HMM.
353    * @param symbol
354    * The symbol for which the desired probability is being requested.
355    * @return
356    *
357    */
 
358  0 toggle public double getInsertEmissionProbability(int alignColumn, char symbol)
359    {
360  0 HMMNode node = getNodeForColumn(alignColumn);
361  0 int symbolIndex = getSymbolIndex(symbol);
362  0 if (node != null && symbolIndex != -1)
363    {
364  0 return node.getInsertEmission(symbolIndex);
365    }
366  0 return 0D;
367    }
368   
369    /**
370    * Gets the state transition probability for a given symbol at a column in the
371    * alignment.
372    *
373    * @param alignColumn
374    * The index of the alignment column, starting at index 0. Index 0
375    * usually corresponds to index 1 in the HMM.
376    * @param symbol
377    * The symbol for which the desired probability is being requested.
378    * @return
379    *
380    */
 
381  0 toggle public double getStateTransitionProbability(int alignColumn,
382    int transition)
383    {
384  0 HMMNode node = getNodeForColumn(alignColumn);
385  0 if (node != null)
386    {
387  0 return node.getStateTransition(transition);
388    }
389  0 return 0D;
390    }
391   
392    /**
393    * Returns the sequence position linked to the node at the given index. This
394    * corresponds to an aligned column position (counting from 1).
395    *
396    * @param nodeIndex
397    * The index of the node, starting from index 1. Index 0 is the begin
398    * node, which does not correspond to a column in the alignment.
399    * @return
400    */
 
401  0 toggle public int getNodeMapPosition(int nodeIndex)
402    {
403  0 return nodes.get(nodeIndex).getResidueNumber();
404    }
405   
406    /**
407    * Returns the consensus residue at the specified node.
408    *
409    * @param nodeIndex
410    * The index of the specified node.
411    * @return
412    */
 
413  0 toggle public char getConsensusResidue(int nodeIndex)
414    {
415  0 char value = nodes.get(nodeIndex).getConsensusResidue();
416  0 return value;
417    }
418   
419    /**
420    * Returns the reference annotation at the specified node.
421    *
422    * @param nodeIndex
423    * The index of the specified node.
424    * @return
425    */
 
426  0 toggle public char getReferenceAnnotation(int nodeIndex)
427    {
428  0 char value = nodes.get(nodeIndex).getReferenceAnnotation();
429  0 return value;
430    }
431   
432    /**
433    * Returns the mask value at the specified node.
434    *
435    * @param nodeIndex
436    * The index of the specified node.
437    * @return
438    */
 
439  0 toggle public char getMaskedValue(int nodeIndex)
440    {
441  0 char value = nodes.get(nodeIndex).getMaskValue();
442  0 return value;
443    }
444   
445    /**
446    * Returns the consensus structure at the specified node.
447    *
448    * @param nodeIndex
449    * The index of the specified node.
450    * @return
451    */
 
452  0 toggle public char getConsensusStructure(int nodeIndex)
453    {
454  0 char value = nodes.get(nodeIndex).getConsensusStructure();
455  0 return value;
456    }
457   
458    /**
459    * Sets a property read from an HMM file
460    *
461    * @param key
462    * @param value
463    */
 
464  0 toggle public void setProperty(String key, String value)
465    {
466  0 fileProperties.put(key, value);
467    }
468   
469    /**
470    * Temporary implementation, should not be used.
471    *
472    * @return
473    */
 
474  0 toggle public String getViterbi()
475    {
476  0 String value;
477  0 value = fileProperties.get(HMMFile.VITERBI);
478  0 return value;
479    }
480   
481    /**
482    * Temporary implementation, should not be used.
483    *
484    * @return
485    */
 
486  0 toggle public String getMSV()
487    {
488  0 String value;
489  0 value = fileProperties.get(HMMFile.MSV);
490  0 return value;
491    }
492   
493    /**
494    * Temporary implementation, should not be used.
495    *
496    * @return
497    */
 
498  0 toggle public String getForward()
499    {
500  0 String value;
501  0 value = fileProperties.get(HMMFile.FORWARD);
502  0 return value;
503    }
504   
505    /**
506    * Constructs the consensus sequence based on the most probable symbol at each
507    * position. Gap characters are inserted for discontinuities in the node map
508    * numbering (if provided), else an ungapped sequence is generated.
509    * <p>
510    * A mapping between the HMM nodes and residue positions of the sequence is
511    * also built and saved.
512    *
513    * @return
514    */
 
515  0 toggle void buildConsensusSequence()
516    {
517  0 List<int[]> toResidues = new ArrayList<>();
518   
519    /*
520    * if the HMM provided a map to sequence, use those start/end values,
521    * else just treat it as for a contiguous sequence numbered from 1
522    */
523  0 boolean hasMap = getBooleanProperty(HMMFile.MAP);
524  0 int start = hasMap ? getNode(1).getResidueNumber() : 1;
525  0 int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
526    : (start + getLength() - 1);
527  0 char[] sequence = new char[endResNo];
528   
529  0 int lastResNo = start - 1;
530  0 int seqOffset = -1;
531  0 int gapCount = 0;
532   
533   
534  0 for (int seqN = 0; seqN < start; seqN++)
535    {
536  0 sequence[seqN] = GAP_DASH;
537  0 seqOffset++;
538    }
539   
540  0 for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
541    {
542  0 HMMNode node = nodes.get(nodeNo);
543  0 final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
544   
545    /*
546    * insert gaps if map numbering is not continuous
547    */
548  0 while (resNo > lastResNo + 1)
549    {
550  0 sequence[seqOffset++] = GAP_DASH;
551  0 lastResNo++;
552  0 gapCount++;
553    }
554  0 char consensusResidue = node.getConsensusResidue();
555  0 if (GAP_DASH == consensusResidue)
556    {
557    /*
558    * no residue annotation in HMM - scan for the symbol
559    * with the highest match emission probability
560    */
561  0 int symbolIndex = node.getMaxMatchEmissionIndex();
562  0 consensusResidue = alphabet.charAt(symbolIndex);
563  0 if (node.getMatchEmission(symbolIndex) < 0.5D)
564    {
565    // follow convention of lower case if match emission prob < 0.5
566  0 consensusResidue = Character.toLowerCase(consensusResidue);
567    }
568    }
569  0 sequence[seqOffset++] = consensusResidue;
570  0 lastResNo = resNo;
571    }
572   
573  0 Sequence seq = new Sequence(getName(), sequence, start,
574    lastResNo - gapCount);
575  0 seq.createDatasetSequence();
576  0 seq.setHMM(this);
577  0 this.hmmSeq = seq;
578   
579    /*
580    * construct and store Mapping of nodes to residues
581    * note as constructed this is just an identity mapping,
582    * but it allows for greater flexibility in future
583    */
584  0 List<int[]> fromNodes = new ArrayList<>();
585  0 fromNodes.add(new int[] { 1, getLength() });
586  0 toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
587  0 MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
588  0 mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
589    }
590   
591   
592    /**
593    * Answers the aligned consensus sequence for the profile. Note this will
594    * return null if called before <code>setNodes</code> has been called.
595    *
596    * @return
597    */
 
598  0 toggle public SequenceI getConsensusSequence()
599    {
600  0 return hmmSeq;
601    }
602   
603    /**
604    * Answers the index position (0...) of the given symbol, or -1 if not a valid
605    * symbol for this HMM
606    *
607    * @param symbol
608    * @return
609    */
 
610  0 toggle private int getSymbolIndex(char symbol)
611    {
612    /*
613    * symbolIndexLookup holds the index for 'A' to 'Z'
614    */
615  0 char c = Character.toUpperCase(symbol);
616  0 if ('A' <= c && c <= 'Z')
617    {
618  0 return symbolIndexLookup[c - 'A'];
619    }
620  0 return -1;
621    }
622   
623    /**
624    * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
625    * and a mapping between node numbers and sequence positions
626    *
627    * @param nodeList
628    */
 
629  0 toggle public void setNodes(List<HMMNode> nodeList)
630    {
631  0 nodes = nodeList;
632  0 if (nodes.size() > 1)
633    {
634  0 buildConsensusSequence();
635    }
636    }
637   
638    /**
639    * Sets the aligned consensus sequence this HMM is the model for
640    *
641    * @param hmmSeq
642    */
 
643  0 toggle public void setHmmSeq(SequenceI hmmSeq)
644    {
645  0 this.hmmSeq = hmmSeq;
646    }
647   
 
648  0 toggle public void setBackgroundFrequencies(Map<Character, Float> bkgdFreqs)
649    {
650  0 backgroundFrequencies = bkgdFreqs;
651    }
652   
 
653  0 toggle public void setBackgroundFrequencies(ResidueCount bkgdFreqs)
654    {
655  0 backgroundFrequencies = new HashMap<>();
656   
657  0 int total = bkgdFreqs.getTotalResidueCount();
658   
659  0 for (char c : bkgdFreqs.getSymbolCounts().symbols)
660    {
661  0 backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total);
662    }
663   
664    }
665   
 
666  0 toggle public Map<Character, Float> getBackgroundFrequencies()
667    {
668  0 return backgroundFrequencies;
669    }
670   
671    }
672