Clover icon

Coverage Report

  1. Project Clover database Thu Dec 4 2025 14:43:25 GMT
  2. Package jalview.datamodel

File HiddenMarkovModel.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

38
132
34
1
672
317
58
0.44
3.88
34
1.71

Classes

Class Line # Actions
HiddenMarkovModel 20 132 58
0.8921568489.2%
 

Contributing tests

This file is covered by 22 tests. .

Source view

1    package jalview.datamodel;
2   
3    import jalview.io.HMMFile;
4    import jalview.schemes.ResidueProperties;
5    import jalview.util.Comparison;
6    import jalview.util.MapList;
7   
8    import java.util.ArrayList;
9    import java.util.Arrays;
10    import java.util.HashMap;
11    import java.util.List;
12    import java.util.Map;
13   
14    /**
15    * Data structure which stores a hidden Markov model
16    *
17    * @author TZVanaalten
18    *
19    */
 
20    public class HiddenMarkovModel
21    {
22    private static final char GAP_DASH = '-';
23   
24    public final static String YES = "yes";
25   
26    public final static String NO = "no";
27   
28    public static final int MATCHTOMATCH = 0;
29   
30    public static final int MATCHTOINSERT = 1;
31   
32    public static final int MATCHTODELETE = 2;
33   
34    public static final int INSERTTOMATCH = 3;
35   
36    public static final int INSERTTOINSERT = 4;
37   
38    public static final int DELETETOMATCH = 5;
39   
40    public static final int DELETETODELETE = 6;
41   
42    private static final double LOG2 = Math.log(2);
43   
44    /*
45    * properties read from HMM file header lines
46    */
47    private Map<String, String> fileProperties = new HashMap<>();
48   
49    private String fileHeader;
50   
51    /*
52    * the symbols used in this model e.g. "ACGT"
53    */
54    private String alphabet;
55   
56    /*
57    * symbol lookup index into the alphabet for 'A' to 'Z'
58    */
59    private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
60   
61    /*
62    * Nodes in the model. The begin node is at index 0, and contains
63    * average emission probabilities for each symbol.
64    */
65    private List<HMMNode> nodes = new ArrayList<>();
66   
67    /*
68    * the aligned HMM consensus sequence extracted from the HMM profile
69    */
70    private SequenceI hmmSeq;
71   
72    /*
73    * mapping from HMM nodes to residues of the hmm consensus sequence
74    */
75    private Mapping mapToHmmConsensus;
76   
77    // stores background frequencies of alignment from which this model came
78    private Map<Character, Float> backgroundFrequencies;
79   
80    /**
81    * Constructor
82    */
 
83  19 toggle public HiddenMarkovModel()
84    {
85    }
86   
87    /**
88    * Copy constructor given a new aligned sequence with which to associate the
89    * HMM profile
90    *
91    * @param hmm
92    * @param sq
93    */
 
94  1 toggle public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
95    {
96  1 super();
97  1 this.fileProperties = new HashMap<>(hmm.fileProperties);
98  1 this.alphabet = hmm.alphabet;
99  1 this.nodes = new ArrayList<>(hmm.nodes);
100  1 this.symbolIndexLookup = hmm.symbolIndexLookup;
101  1 this.fileHeader = new String(hmm.fileHeader);
102  1 this.hmmSeq = sq;
103  1 this.backgroundFrequencies = hmm.getBackgroundFrequencies();
104  1 if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo())
105    {
106    // same dataset sequence e.g. after realigning search results
107  0 this.mapToHmmConsensus = hmm.mapToHmmConsensus;
108    }
109    else
110    {
111    // different dataset sequence e.g. after loading HMM from project
112  1 this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(),
113    hmm.mapToHmmConsensus.getMap());
114    }
115    }
116   
117    /**
118    * Returns the information content at a specified column, calculated as the
119    * sum (over possible symbols) of the log ratio
120    *
121    * <pre>
122    * log(emission probability / background probability) / log(2)
123    * </pre>
124    *
125    * @param column
126    * column position (base 0)
127    * @return
128    */
 
129  470 toggle public float getInformationContent(int column)
130    {
131  470 float informationContent = 0f;
132   
133  470 for (char symbol : getSymbols().toCharArray())
134    {
135  9384 float freq = ResidueProperties.backgroundFrequencies
136    .get(getAlphabetType()).get(symbol);
137  9384 float prob = (float) getMatchEmissionProbability(column, symbol);
138  9384 informationContent += prob * Math.log(prob / freq);
139    }
140   
141  470 informationContent = informationContent / (float) LOG2;
142   
143  470 return informationContent;
144    }
145   
146    /**
147    * Gets the file header of the .hmm file this model came from
148    *
149    * @return
150    */
 
151  795 toggle public String getFileHeader()
152    {
153  795 return fileHeader;
154    }
155   
156    /**
157    * Sets the file header of this model.
158    *
159    * @param header
160    */
 
161  17 toggle public void setFileHeader(String header)
162    {
163  17 fileHeader = header;
164    }
165   
166    /**
167    * Returns the symbols used in this hidden Markov model
168    *
169    * @return
170    */
 
171  484 toggle public String getSymbols()
172    {
173  484 return alphabet;
174    }
175   
176    /**
177    * Gets the node in the hidden Markov model at the specified position.
178    *
179    * @param nodeIndex
180    * The index of the node requested. Node 0 optionally contains the
181    * average match emission probabilities across the entire model, and
182    * always contains the insert emission probabilities and state
183    * transition probabilities for the begin node. Node 1 contains the
184    * first node in the HMM that can correspond to a column in the
185    * alignment.
186    * @return
187    */
 
188  21851 toggle public HMMNode getNode(int nodeIndex)
189    {
190  21851 return nodes.get(nodeIndex);
191    }
192   
193    /**
194    * Returns the name of the sequence alignment on which the HMM is based.
195    *
196    * @return
197    */
 
198  20 toggle public String getName()
199    {
200  20 return fileProperties.get(HMMFile.NAME);
201    }
202   
203    /**
204    * Answers the string value of the property (parsed from an HMM file) for the
205    * given key, or null if the property is not present
206    *
207    * @param key
208    * @return
209    */
 
210  100 toggle public String getProperty(String key)
211    {
212  100 return fileProperties.get(key);
213    }
214   
215    /**
216    * Answers true if the property with the given key is present with a value of
217    * "yes" (not case-sensitive), else false
218    *
219    * @param key
220    * @return
221    */
 
222  57 toggle public boolean getBooleanProperty(String key)
223    {
224  57 return YES.equalsIgnoreCase(fileProperties.get(key));
225    }
226   
227    /**
228    * Returns the length of the hidden Markov model. The value returned is the
229    * LENG property if specified, else the number of nodes, excluding the begin
230    * node (which should be the same thing).
231    *
232    * @return
233    */
 
234  289 toggle public int getLength()
235    {
236  289 if (fileProperties.get(HMMFile.LENGTH) == null)
237    {
238  5 return nodes.size() - 1; // not counting BEGIN node
239    }
240  284 return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
241    }
242   
243    /**
244    * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
245    * the options. Other alphabets may be added.
246    *
247    * @return
248    */
 
249  18781 toggle public String getAlphabetType()
250    {
251  18781 return fileProperties.get(HMMFile.ALPHABET);
252    }
253   
254    /**
255    * Sets the model alphabet to the symbols in the given string (ignoring any
256    * whitespace), and returns the number of symbols
257    *
258    * @param symbols
259    */
 
260  33 toggle public int setAlphabet(String symbols)
261    {
262  33 String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
263  33 int count = trimmed.length();
264  33 alphabet = trimmed;
265  33 symbolIndexLookup = new int['Z' - 'A' + 1];
266  33 Arrays.fill(symbolIndexLookup, -1);
267  33 int ignored = 0;
268   
269    /*
270    * save the symbols in order, and a quick lookup of symbol position
271    */
272  580 for (short i = 0; i < count; i++)
273    {
274  547 char symbol = trimmed.charAt(i);
275  547 if (symbol >= 'A' && symbol <= 'Z'
276    && symbolIndexLookup[symbol - 'A'] == -1)
277    {
278  547 symbolIndexLookup[symbol - 'A'] = i;
279    }
280    else
281    {
282  0 System.err
283    .println(
284    "Unexpected or duplicated character in HMM ALPHabet: "
285    + symbol);
286  0 ignored++;
287    }
288    }
289  33 return count - ignored;
290    }
291   
292    /**
293    * Answers the node of the model corresponding to an aligned column position
294    * (0...), or null if there is no such node
295    *
296    * @param column
297    * @return
298    */
 
299  18856 toggle HMMNode getNodeForColumn(int column)
300    {
301    /*
302    * if the hmm consensus is gapped at the column,
303    * there is no corresponding node
304    */
305  18856 if (Comparison.isGap(hmmSeq.getCharAt(column)))
306    {
307  982 return null;
308    }
309   
310    /*
311    * find the node (if any) that is mapped to the
312    * consensus sequence residue position at the column
313    */
314  17874 int seqPos = hmmSeq.findPosition(column);
315  17874 int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
316  17874 if (nodeNo != null)
317    {
318  17874 return getNode(nodeNo[0]);
319    }
320  0 return null;
321    }
322   
323    /**
324    * Gets the match emission probability for a given symbol at a column in the
325    * alignment.
326    *
327    * @param alignColumn
328    * The index of the alignment column, starting at index 0. Index 0
329    * usually corresponds to index 1 in the HMM.
330    * @param symbol
331    * The symbol for which the desired probability is being requested.
332    * @return
333    *
334    */
 
335  18821 toggle public double getMatchEmissionProbability(int alignColumn, char symbol)
336    {
337  18821 HMMNode node = getNodeForColumn(alignColumn);
338  18821 int symbolIndex = getSymbolIndex(symbol);
339  18821 if (node != null && symbolIndex != -1)
340    {
341  17838 return node.getMatchEmission(symbolIndex);
342    }
343  983 return 0D;
344    }
345   
346    /**
347    * Gets the insert emission probability for a given symbol at a column in the
348    * alignment.
349    *
350    * @param alignColumn
351    * The index of the alignment column, starting at index 0. Index 0
352    * usually corresponds to index 1 in the HMM.
353    * @param symbol
354    * The symbol for which the desired probability is being requested.
355    * @return
356    *
357    */
 
358  17 toggle public double getInsertEmissionProbability(int alignColumn, char symbol)
359    {
360  17 HMMNode node = getNodeForColumn(alignColumn);
361  17 int symbolIndex = getSymbolIndex(symbol);
362  17 if (node != null && symbolIndex != -1)
363    {
364  16 return node.getInsertEmission(symbolIndex);
365    }
366  1 return 0D;
367    }
368   
369    /**
370    * Gets the state transition probability for a given symbol at a column in the
371    * alignment.
372    *
373    * @param alignColumn
374    * The index of the alignment column, starting at index 0. Index 0
375    * usually corresponds to index 1 in the HMM.
376    * @param symbol
377    * The symbol for which the desired probability is being requested.
378    * @return
379    *
380    */
 
381  18 toggle public double getStateTransitionProbability(int alignColumn,
382    int transition)
383    {
384  18 HMMNode node = getNodeForColumn(alignColumn);
385  18 if (node != null)
386    {
387  18 return node.getStateTransition(transition);
388    }
389  0 return 0D;
390    }
391   
392    /**
393    * Returns the sequence position linked to the node at the given index. This
394    * corresponds to an aligned column position (counting from 1).
395    *
396    * @param nodeIndex
397    * The index of the node, starting from index 1. Index 0 is the begin
398    * node, which does not correspond to a column in the alignment.
399    * @return
400    */
 
401  1311 toggle public int getNodeMapPosition(int nodeIndex)
402    {
403  1311 return nodes.get(nodeIndex).getResidueNumber();
404    }
405   
406    /**
407    * Returns the consensus residue at the specified node.
408    *
409    * @param nodeIndex
410    * The index of the specified node.
411    * @return
412    */
 
413  1311 toggle public char getConsensusResidue(int nodeIndex)
414    {
415  1311 char value = nodes.get(nodeIndex).getConsensusResidue();
416  1311 return value;
417    }
418   
419    /**
420    * Returns the reference annotation at the specified node.
421    *
422    * @param nodeIndex
423    * The index of the specified node.
424    * @return
425    */
 
426  1311 toggle public char getReferenceAnnotation(int nodeIndex)
427    {
428  1311 char value = nodes.get(nodeIndex).getReferenceAnnotation();
429  1311 return value;
430    }
431   
432    /**
433    * Returns the mask value at the specified node.
434    *
435    * @param nodeIndex
436    * The index of the specified node.
437    * @return
438    */
 
439  533 toggle public char getMaskedValue(int nodeIndex)
440    {
441  533 char value = nodes.get(nodeIndex).getMaskValue();
442  533 return value;
443    }
444   
445    /**
446    * Returns the consensus structure at the specified node.
447    *
448    * @param nodeIndex
449    * The index of the specified node.
450    * @return
451    */
 
452  533 toggle public char getConsensusStructure(int nodeIndex)
453    {
454  533 char value = nodes.get(nodeIndex).getConsensusStructure();
455  533 return value;
456    }
457   
458    /**
459    * Sets a property read from an HMM file
460    *
461    * @param key
462    * @param value
463    */
 
464  303 toggle public void setProperty(String key, String value)
465    {
466  303 fileProperties.put(key, value);
467    }
468   
469    /**
470    * Temporary implementation, should not be used.
471    *
472    * @return
473    */
 
474  7 toggle public String getViterbi()
475    {
476  7 String value;
477  7 value = fileProperties.get(HMMFile.VITERBI);
478  7 return value;
479    }
480   
481    /**
482    * Temporary implementation, should not be used.
483    *
484    * @return
485    */
 
486  12 toggle public String getMSV()
487    {
488  12 String value;
489  12 value = fileProperties.get(HMMFile.MSV);
490  12 return value;
491    }
492   
493    /**
494    * Temporary implementation, should not be used.
495    *
496    * @return
497    */
 
498  7 toggle public String getForward()
499    {
500  7 String value;
501  7 value = fileProperties.get(HMMFile.FORWARD);
502  7 return value;
503    }
504   
505    /**
506    * Constructs the consensus sequence based on the most probable symbol at each
507    * position. Gap characters are inserted for discontinuities in the node map
508    * numbering (if provided), else an ungapped sequence is generated.
509    * <p>
510    * A mapping between the HMM nodes and residue positions of the sequence is
511    * also built and saved.
512    *
513    * @return
514    */
 
515  17 toggle void buildConsensusSequence()
516    {
517  17 List<int[]> toResidues = new ArrayList<>();
518   
519    /*
520    * if the HMM provided a map to sequence, use those start/end values,
521    * else just treat it as for a contiguous sequence numbered from 1
522    */
523  17 boolean hasMap = getBooleanProperty(HMMFile.MAP);
524  17 int start = hasMap ? getNode(1).getResidueNumber() : 1;
525  17 int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
526    : (start + getLength() - 1);
527  17 char[] sequence = new char[endResNo];
528   
529  17 int lastResNo = start - 1;
530  17 int seqOffset = -1;
531  17 int gapCount = 0;
532   
533   
534  42 for (int seqN = 0; seqN < start; seqN++)
535    {
536  25 sequence[seqN] = GAP_DASH;
537  25 seqOffset++;
538    }
539   
540  2651 for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
541    {
542  2634 HMMNode node = nodes.get(nodeNo);
543  2634 final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
544   
545    /*
546    * insert gaps if map numbering is not continuous
547    */
548  7097 while (resNo > lastResNo + 1)
549    {
550  4463 sequence[seqOffset++] = GAP_DASH;
551  4463 lastResNo++;
552  4463 gapCount++;
553    }
554  2634 char consensusResidue = node.getConsensusResidue();
555  2634 if (GAP_DASH == consensusResidue)
556    {
557    /*
558    * no residue annotation in HMM - scan for the symbol
559    * with the highest match emission probability
560    */
561  2 int symbolIndex = node.getMaxMatchEmissionIndex();
562  2 consensusResidue = alphabet.charAt(symbolIndex);
563  2 if (node.getMatchEmission(symbolIndex) < 0.5D)
564    {
565    // follow convention of lower case if match emission prob < 0.5
566  2 consensusResidue = Character.toLowerCase(consensusResidue);
567    }
568    }
569  2634 sequence[seqOffset++] = consensusResidue;
570  2634 lastResNo = resNo;
571    }
572   
573  17 Sequence seq = new Sequence(getName(), sequence, start,
574    lastResNo - gapCount);
575  17 seq.createDatasetSequence();
576  17 seq.setHMM(this);
577  17 this.hmmSeq = seq;
578   
579    /*
580    * construct and store Mapping of nodes to residues
581    * note as constructed this is just an identity mapping,
582    * but it allows for greater flexibility in future
583    */
584  17 List<int[]> fromNodes = new ArrayList<>();
585  17 fromNodes.add(new int[] { 1, getLength() });
586  17 toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
587  17 MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
588  17 mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
589    }
590   
591   
592    /**
593    * Answers the aligned consensus sequence for the profile. Note this will
594    * return null if called before <code>setNodes</code> has been called.
595    *
596    * @return
597    */
 
598  9 toggle public SequenceI getConsensusSequence()
599    {
600  9 return hmmSeq;
601    }
602   
603    /**
604    * Answers the index position (0...) of the given symbol, or -1 if not a valid
605    * symbol for this HMM
606    *
607    * @param symbol
608    * @return
609    */
 
610  18838 toggle private int getSymbolIndex(char symbol)
611    {
612    /*
613    * symbolIndexLookup holds the index for 'A' to 'Z'
614    */
615  18838 char c = Character.toUpperCase(symbol);
616  18838 if ('A' <= c && c <= 'Z')
617    {
618  18838 return symbolIndexLookup[c - 'A'];
619    }
620  0 return -1;
621    }
622   
623    /**
624    * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
625    * and a mapping between node numbers and sequence positions
626    *
627    * @param nodeList
628    */
 
629  17 toggle public void setNodes(List<HMMNode> nodeList)
630    {
631  17 nodes = nodeList;
632  17 if (nodes.size() > 1)
633    {
634  17 buildConsensusSequence();
635    }
636    }
637   
638    /**
639    * Sets the aligned consensus sequence this HMM is the model for
640    *
641    * @param hmmSeq
642    */
 
643  0 toggle public void setHmmSeq(SequenceI hmmSeq)
644    {
645  0 this.hmmSeq = hmmSeq;
646    }
647   
 
648  0 toggle public void setBackgroundFrequencies(Map<Character, Float> bkgdFreqs)
649    {
650  0 backgroundFrequencies = bkgdFreqs;
651    }
652   
 
653  0 toggle public void setBackgroundFrequencies(ResidueCount bkgdFreqs)
654    {
655  0 backgroundFrequencies = new HashMap<>();
656   
657  0 int total = bkgdFreqs.getTotalResidueCount();
658   
659  0 for (char c : bkgdFreqs.getSymbolCounts().symbols)
660    {
661  0 backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total);
662    }
663   
664    }
665   
 
666  4 toggle public Map<Character, Float> getBackgroundFrequencies()
667    {
668  4 return backgroundFrequencies;
669    }
670   
671    }
672