Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.io

File StockholmFile.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

248
478
17
1
1,303
940
176
0.37
28.12
17
10.35

Classes

Class Line # Actions
StockholmFile 76 478 176
0.756393 75.6%
 

Contributing tests

This file is covered by 13 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    /*
22    * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk
23    */
24    package jalview.io;
25   
26    import java.io.BufferedReader;
27    import java.io.FileReader;
28    import java.io.IOException;
29    import java.util.ArrayList;
30    import java.util.Enumeration;
31    import java.util.Hashtable;
32    import java.util.LinkedHashMap;
33    import java.util.List;
34    import java.util.Map;
35    import java.util.Vector;
36   
37    import com.stevesoft.pat.Regex;
38   
39    import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
40    import fr.orsay.lri.varna.factories.RNAFactory;
41    import fr.orsay.lri.varna.models.rna.RNA;
42    import jalview.analysis.Rna;
43    import jalview.datamodel.AlignmentAnnotation;
44    import jalview.datamodel.AlignmentI;
45    import jalview.datamodel.Annotation;
46    import jalview.datamodel.DBRefEntry;
47    import jalview.datamodel.DBRefSource;
48    import jalview.datamodel.Mapping;
49    import jalview.datamodel.Sequence;
50    import jalview.datamodel.SequenceFeature;
51    import jalview.datamodel.SequenceI;
52    import jalview.schemes.ResidueProperties;
53    import jalview.util.Comparison;
54    import jalview.util.DBRefUtils;
55    import jalview.util.Format;
56    import jalview.util.MessageManager;
57   
58    // import org.apache.log4j.*;
59   
60    /**
61    * This class is supposed to parse a Stockholm format file into Jalview. There
62    * are TODOs in this class: we do not know what the database source and version
63    * is for the file when parsing the #GS= AC tag which associates accessions with
64    * sequences. Database references are also not parsed correctly: a separate
65    * reference string parser must be added to parse the database reference form
66    * into Jalview's local representation.
67    *
68    * @author bsb at sanger.ac.uk
69    * @author Natasha Shersnev (Dundee, UK) (Stockholm file writer)
70    * @author Lauren Lui (UCSC, USA) (RNA secondary structure annotation import as
71    * stockholm)
72    * @author Anne Menard (Paris, FR) (VARNA parsing of Stockholm file data)
73    * @version 0.3 + jalview mods
74    *
75    */
 
76    public class StockholmFile extends AlignFile
77    {
78    private static final String ANNOTATION = "annotation";
79   
80    // private static final Regex OPEN_PAREN = new Regex("(<|\\[)", "(");
81    //
82    // private static final Regex CLOSE_PAREN = new Regex("(>|\\])", ")");
83   
84    public static final Regex DETECT_BRACKETS = new Regex(
85    "(<|>|\\[|\\]|\\(|\\)|\\{|\\})");
86   
87    // WUSS extended symbols. Avoid ambiguity with protein SS annotations by using NOT_RNASS first.
88    public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
89   
90    // use the following regex to decide an annotations (whole) line is NOT an RNA
91    // SS (it contains only E,H,e,h and other non-brace/non-alpha chars)
92    private static final Regex NOT_RNASS = new Regex(
93    "^[^<>[\\](){}A-DF-Za-df-z]*$");
94   
95    StringBuffer out; // output buffer
96   
97    AlignmentI al;
98   
 
/**
 * Creates a StockholmFile with no input source and no alignment attached.
 */
public StockholmFile()
{
  super();
}
102   
/**
 * Creates a new StockholmFile object for output.
 *
 * @param al
 *          the alignment to associate with this file object; it is stored as
 *          given (no copy is made)
 */
public StockholmFile(AlignmentI al)
{
  super();
  this.al = al;
}
110   
 
/**
 * Creates a StockholmFile for the given input, delegating entirely to the
 * superclass constructor.
 *
 * @param inFile
 *          the input, interpreted by the superclass according to
 *          <code>type</code>
 * @param type
 *          the kind of data source <code>inFile</code> refers to
 * @throws IOException
 *           if the superclass constructor fails
 */
public StockholmFile(String inFile, DataSourceType type) throws IOException
{
  super(inFile, type);
}
116   
 
/**
 * Creates a StockholmFile reading from an already opened source, delegating
 * entirely to the superclass constructor.
 *
 * @param source
 *          the data source to read from
 * @throws IOException
 *           if the superclass constructor fails
 */
public StockholmFile(FileParse source) throws IOException
{
  super(source);
}
121   
 
122  53 toggle @Override
123    public void initData()
124    {
125  53 super.initData();
126    }
127   
128    /**
129    * Parse a file in Stockholm format into Jalview's data model using VARNA
130    *
131    * @throws IOException
132    * If there is an error with the input file
133    */
 
134  0 toggle public void parse_with_VARNA(java.io.File inFile) throws IOException
135    {
136  0 FileReader fr = null;
137  0 fr = new FileReader(inFile);
138   
139  0 BufferedReader r = new BufferedReader(fr);
140  0 List<RNA> result = null;
141  0 try
142    {
143  0 result = RNAFactory.loadSecStrStockholm(r);
144    } catch (ExceptionUnmatchedClosingParentheses umcp)
145    {
146  0 errormessage = "Unmatched parentheses in annotation. Aborting ("
147    + umcp.getMessage() + ")";
148  0 throw new IOException(umcp);
149    }
150    // DEBUG System.out.println("this is the secondary scructure:"
151    // +result.size());
152  0 SequenceI[] seqs = new SequenceI[result.size()];
153  0 String id = null;
154  0 for (int i = 0; i < result.size(); i++)
155    {
156    // DEBUG System.err.println("Processing i'th sequence in Stockholm file")
157  0 RNA current = result.get(i);
158   
159  0 String seq = current.getSeq();
160  0 String rna = current.getStructDBN(true);
161    // DEBUG System.out.println(seq);
162    // DEBUG System.err.println(rna);
163  0 int begin = 0;
164  0 int end = seq.length() - 1;
165  0 id = safeName(getDataName());
166  0 seqs[i] = new Sequence(id, seq, begin, end);
167  0 String[] annot = new String[rna.length()];
168  0 Annotation[] ann = new Annotation[rna.length()];
169  0 for (int j = 0; j < rna.length(); j++)
170    {
171  0 annot[j] = rna.substring(j, j + 1);
172   
173    }
174   
175  0 for (int k = 0; k < rna.length(); k++)
176    {
177  0 ann[k] = new Annotation(annot[k], "",
178    Rna.getRNASecStrucState(annot[k]).charAt(0), 0f);
179   
180    }
181  0 AlignmentAnnotation align = new AlignmentAnnotation("Sec. str.",
182    current.getID(), ann);
183   
184  0 seqs[i].addAlignmentAnnotation(align);
185  0 seqs[i].setRNA(result.get(i));
186  0 this.annotations.addElement(align);
187    }
188  0 this.setSeqs(seqs);
189   
190    }
191   
192    /**
193    * Parse a file in Stockholm format into Jalview's data model. The file has to
194    * be passed at construction time
195    *
196    * @throws IOException
197    * If there is an error with the input file
198    */
 
199  42 toggle @Override
200    public void parse() throws IOException
201    {
202  42 StringBuffer treeString = new StringBuffer();
203  42 String treeName = null;
204    // --------------- Variable Definitions -------------------
205  42 String line;
206  42 String version;
207    // String id;
208  42 Hashtable seqAnn = new Hashtable(); // Sequence related annotations
209  42 LinkedHashMap<String, String> seqs = new LinkedHashMap<>();
210  42 Regex p, r, rend, s, x;
211    // Temporary line for processing RNA annotation
212    // String RNAannot = "";
213   
214    // ------------------ Parsing File ----------------------
215    // First, we have to check that this file has STOCKHOLM format, i.e. the
216    // first line must match
217   
218  42 r = new Regex("# STOCKHOLM ([\\d\\.]+)");
219  42 if (!r.search(nextLine()))
220    {
221  0 throw new IOException(MessageManager
222    .getString("exception.stockholm_invalid_format"));
223    }
224    else
225    {
226  42 version = r.stringMatched(1);
227   
228    // logger.debug("Stockholm version: " + version);
229    }
230   
231    // We define some Regexes here that will be used regularily later
232  42 rend = new Regex("^\\s*\\/\\/"); // Find the end of an alignment
233  42 p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in
234    // id/from/to
235  42 s = new Regex("(\\S+)\\s+(\\S*)\\s+(.*)"); // Parses annotation subtype
236  42 r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line
237  42 x = new Regex("(\\S+)\\s+(\\S+)"); // split id from sequence
238   
239    // Convert all bracket types to parentheses (necessary for passing to VARNA)
240  42 Regex openparen = new Regex("(<|\\[)", "(");
241  42 Regex closeparen = new Regex("(>|\\])", ")");
242   
243    // // Detect if file is RNA by looking for bracket types
244    // Regex detectbrackets = new Regex("(<|>|\\[|\\]|\\(|\\))");
245   
246  42 rend.optimize();
247  42 p.optimize();
248  42 s.optimize();
249  42 r.optimize();
250  42 x.optimize();
251  42 openparen.optimize();
252  42 closeparen.optimize();
253   
254  ? while ((line = nextLine()) != null)
255    {
256  2474 if (line.length() == 0)
257    {
258  6 continue;
259    }
260  2468 if (rend.search(line))
261    {
262    // End of the alignment, pass stuff back
263  42 this.noSeqs = seqs.size();
264   
265  42 String dbsource = null;
266  42 Regex pf = new Regex("PF[0-9]{5}(.*)"); // Finds AC for Pfam
267  42 Regex rf = new Regex("RF[0-9]{5}(.*)"); // Finds AC for Rfam
268  42 if (getAlignmentProperty("AC") != null)
269    {
270  6 String dbType = getAlignmentProperty("AC").toString();
271  6 if (pf.search(dbType))
272    {
273    // PFAM Alignment - so references are typically from Uniprot
274  3 dbsource = "PFAM";
275    }
276  3 else if (rf.search(dbType))
277    {
278  3 dbsource = "RFAM";
279    }
280    }
281    // logger.debug("Number of sequences: " + this.noSeqs);
282  42 for (Map.Entry<String, String> skey : seqs.entrySet())
283    {
284    // logger.debug("Processing sequence " + acc);
285  910 String acc = skey.getKey();
286  910 String seq = skey.getValue();
287  910 if (maxLength < seq.length())
288    {
289  42 maxLength = seq.length();
290    }
291  910 int start = 1;
292  910 int end = -1;
293  910 String sid = acc;
294    /*
295    * Retrieve hash of annotations for this accession Associate
296    * Annotation with accession
297    */
298  910 Hashtable accAnnotations = null;
299   
300  910 if (seqAnn != null && seqAnn.containsKey(acc))
301    {
302  894 accAnnotations = (Hashtable) seqAnn.remove(acc);
303    // TODO: add structures to sequence
304    }
305   
306    // Split accession in id and from/to
307  910 if (p.search(acc))
308    {
309  684 sid = p.stringMatched(1);
310  684 start = Integer.parseInt(p.stringMatched(2));
311  684 end = Integer.parseInt(p.stringMatched(3));
312    }
313    // logger.debug(sid + ", " + start + ", " + end);
314   
315  910 Sequence seqO = new Sequence(sid, seq, start, end);
316    // Add Description (if any)
317  910 if (accAnnotations != null && accAnnotations.containsKey("DE"))
318    {
319  0 String desc = (String) accAnnotations.get("DE");
320  0 seqO.setDescription((desc == null) ? "" : desc);
321    }
322    // Add DB References (if any)
323  910 if (accAnnotations != null && accAnnotations.containsKey("DR"))
324    {
325  26 String dbr = (String) accAnnotations.get("DR");
326  26 if (dbr != null && dbr.indexOf(";") > -1)
327    {
328  26 String src = dbr.substring(0, dbr.indexOf(";"));
329  26 String acn = dbr.substring(dbr.indexOf(";") + 1);
330  26 jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn);
331    }
332    }
333   
334  910 if (accAnnotations != null && accAnnotations.containsKey("AC"))
335    {
336  889 String dbr = (String) accAnnotations.get("AC");
337  889 if (dbr != null)
338    {
339    // we could get very clever here - but for now - just try to
340    // guess accession type from type of sequence, source of alignment plus
341    // structure
342    // of accession
343  889 guessDatabaseFor(seqO, dbr, dbsource);
344    }
345    // else - do what ? add the data anyway and prompt the user to
346    // specify what references these are ?
347    }
348   
349  910 Hashtable features = null;
350    // We need to adjust the positions of all features to account for gaps
351  910 try
352    {
353  910 features = (Hashtable) accAnnotations.remove("features");
354    } catch (java.lang.NullPointerException e)
355    {
356    // loggerwarn("Getting Features for " + acc + ": " +
357    // e.getMessage());
358    // continue;
359    }
360    // if we have features
361  910 if (features != null)
362    {
363  314 int posmap[] = seqO.findPositionMap();
364  314 Enumeration i = features.keys();
365  628 while (i.hasMoreElements())
366    {
367    // TODO: parse out secondary structure annotation as annotation
368    // row
369    // TODO: parse out scores as annotation row
370    // TODO: map coding region to core jalview feature types
371  314 String type = i.nextElement().toString();
372  314 Hashtable content = (Hashtable) features.remove(type);
373   
374    // add alignment annotation for this feature
375  314 String key = type2id(type);
376   
377    /*
378    * have we added annotation rows for this type ?
379    */
380  314 boolean annotsAdded = false;
381  314 if (key != null)
382    {
383  314 if (accAnnotations != null
384    && accAnnotations.containsKey(key))
385    {
386  314 Vector vv = (Vector) accAnnotations.get(key);
387  628 for (int ii = 0; ii < vv.size(); ii++)
388    {
389  314 annotsAdded = true;
390  314 AlignmentAnnotation an = (AlignmentAnnotation) vv
391    .elementAt(ii);
392  314 seqO.addAlignmentAnnotation(an);
393  314 annotations.add(an);
394    }
395    }
396    }
397   
398  314 Enumeration j = content.keys();
399  628 while (j.hasMoreElements())
400    {
401  314 String desc = j.nextElement().toString();
402  314 if (ANNOTATION.equals(desc) && annotsAdded)
403    {
404    // don't add features if we already added an annotation row
405  314 continue;
406    }
407  0 String ns = content.get(desc).toString();
408  0 char[] byChar = ns.toCharArray();
409  0 for (int k = 0; k < byChar.length; k++)
410    {
411  0 char c = byChar[k];
412  0 if (!(c == ' ' || c == '_' || c == '-' || c == '.')) // PFAM
413    // uses
414    // '.'
415    // for
416    // feature
417    // background
418    {
419  0 int new_pos = posmap[k]; // look up nearest seqeunce
420    // position to this column
421  0 SequenceFeature feat = new SequenceFeature(type, desc,
422    new_pos, new_pos, null);
423   
424  0 seqO.addSequenceFeature(feat);
425    }
426    }
427    }
428   
429    }
430   
431    }
432    // garbage collect
433   
434    // logger.debug("Adding seq " + acc + " from " + start + " to " + end
435    // + ": " + seq);
436  910 this.seqs.addElement(seqO);
437    }
438  42 return; // finished parsing this segment of source
439    }
440  2426 else if (!r.search(line))
441    {
442    // System.err.println("Found sequence line: " + line);
443   
444    // Split sequence in sequence and accession parts
445  910 if (!x.search(line))
446    {
447    // logger.error("Could not parse sequence line: " + line);
448  0 throw new IOException(MessageManager.formatMessage(
449    "exception.couldnt_parse_sequence_line", new String[]
450    { line }));
451    }
452  910 String ns = seqs.get(x.stringMatched(1));
453  910 if (ns == null)
454    {
455  910 ns = "";
456    }
457  910 ns += x.stringMatched(2);
458   
459  910 seqs.put(x.stringMatched(1), ns);
460    }
461    else
462    {
463  1516 String annType = r.stringMatched(1);
464  1516 String annContent = r.stringMatched(2);
465   
466    // System.err.println("type:" + annType + " content: " + annContent);
467   
468  1516 if (annType.equals("GF"))
469    {
470    /*
471    * Generic per-File annotation, free text Magic features: #=GF NH
472    * <tree in New Hampshire eXtended format> #=GF TN <Unique identifier
473    * for the next tree> Pfam descriptions: 7. DESCRIPTION OF FIELDS
474    *
475    * Compulsory fields: ------------------
476    *
477    * AC Accession number: Accession number in form PFxxxxx.version or
478    * PBxxxxxx. ID Identification: One word name for family. DE
479    * Definition: Short description of family. AU Author: Authors of the
480    * entry. SE Source of seed: The source suggesting the seed members
481    * belong to one family. GA Gathering method: Search threshold to
482    * build the full alignment. TC Trusted Cutoff: Lowest sequence score
483    * and domain score of match in the full alignment. NC Noise Cutoff:
484    * Highest sequence score and domain score of match not in full
485    * alignment. TP Type: Type of family -- presently Family, Domain,
486    * Motif or Repeat. SQ Sequence: Number of sequences in alignment. AM
487    * Alignment Method The order ls and fs hits are aligned to the model
488    * to build the full align. // End of alignment.
489    *
490    * Optional fields: ----------------
491    *
492    * DC Database Comment: Comment about database reference. DR Database
493    * Reference: Reference to external database. RC Reference Comment:
494    * Comment about literature reference. RN Reference Number: Reference
495    * Number. RM Reference Medline: Eight digit medline UI number. RT
496    * Reference Title: Reference Title. RA Reference Author: Reference
497    * Author RL Reference Location: Journal location. PI Previous
498    * identifier: Record of all previous ID lines. KW Keywords: Keywords.
499    * CC Comment: Comments. NE Pfam accession: Indicates a nested domain.
500    * NL Location: Location of nested domains - sequence ID, start and
501    * end of insert.
502    *
503    * Obsolete fields: ----------- AL Alignment method of seed: The
504    * method used to align the seed members.
505    */
506    // Let's save the annotations, maybe we'll be able to do something
507    // with them later...
508  152 Regex an = new Regex("(\\w+)\\s*(.*)");
509  152 if (an.search(annContent))
510    {
511  152 if (an.stringMatched(1).equals("NH"))
512    {
513  0 treeString.append(an.stringMatched(2));
514    }
515  152 else if (an.stringMatched(1).equals("TN"))
516    {
517  0 if (treeString.length() > 0)
518    {
519  0 if (treeName == null)
520    {
521  0 treeName = "Tree " + (getTreeCount() + 1);
522    }
523  0 addNewickTree(treeName, treeString.toString());
524    }
525  0 treeName = an.stringMatched(2);
526  0 treeString = new StringBuffer();
527    }
528    // TODO: JAL-3532 - this is where GF comments and database references are lost
529    // suggest overriding this method for Stockholm files to catch and properly
530    // process CC, DR etc into multivalued properties
531  152 setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
532    }
533    }
534  1364 else if (annType.equals("GS"))
535    {
536    // Generic per-Sequence annotation, free text
537    /*
538    * Pfam uses these features: Feature Description ---------------------
539    * ----------- AC <accession> ACcession number DE <freetext>
540    * DEscription DR <db>; <accession>; Database Reference OS <organism>
541    * OrganiSm (species) OC <clade> Organism Classification (clade, etc.)
542    * LO <look> Look (Color, etc.)
543    */
544  1037 if (s.search(annContent))
545    {
546  1037 String acc = s.stringMatched(1);
547  1037 String type = s.stringMatched(2);
548  1037 String content = s.stringMatched(3);
549    // TODO: store DR in a vector.
550    // TODO: store AC according to generic file db annotation.
551  1037 Hashtable ann;
552  1037 if (seqAnn.containsKey(acc))
553    {
554  148 ann = (Hashtable) seqAnn.get(acc);
555    }
556    else
557    {
558  889 ann = new Hashtable();
559    }
560  1037 ann.put(type, content);
561  1037 seqAnn.put(acc, ann);
562    }
563    else
564    {
565    // throw new IOException("Error parsing " + line);
566  0 System.err.println(">> missing annotation: " + line);
567    }
568    }
569  327 else if (annType.equals("GC"))
570    {
571    // Generic per-Column annotation, exactly 1 char per column
572    // always need a label.
573  13 if (x.search(annContent))
574    {
575    // parse out and create alignment annotation directly.
576  13 parseAnnotationRow(annotations, x.stringMatched(1),
577    x.stringMatched(2));
578    }
579    }
580  314 else if (annType.equals("GR"))
581    {
582    // Generic per-Sequence AND per-Column markup, exactly 1 char per
583    // column
584    /*
585    * Feature Description Markup letters ------- -----------
586    * -------------- SS Secondary Structure [HGIEBTSCX] SA Surface
587    * Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%) TM TransMembrane
588    * [Mio] PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15;
589    * *=0.95-1.00) LI LIgand binding [*] AS Active Site [*] IN INtron (in
590    * or after) [0-2]
591    */
592  314 if (s.search(annContent))
593    {
594  314 String acc = s.stringMatched(1);
595  314 String type = s.stringMatched(2);
596  314 String oseq = s.stringMatched(3);
597    /*
598    * copy of annotation field that may be processed into whitespace chunks
599    */
600  314 String seq = new String(oseq);
601   
602  314 Hashtable ann;
603    // Get an object with all the annotations for this sequence
604  314 if (seqAnn.containsKey(acc))
605    {
606    // logger.debug("Found annotations for " + acc);
607  309 ann = (Hashtable) seqAnn.get(acc);
608    }
609    else
610    {
611    // logger.debug("Creating new annotations holder for " + acc);
612  5 ann = new Hashtable();
613  5 seqAnn.put(acc, ann);
614    }
615   
616    // // start of block for appending annotation lines for wrapped
617    // stokchholm file
618    // TODO test structure, call parseAnnotationRow with vector from
619    // hashtable for specific sequence
620   
621  314 Hashtable features;
622    // Get an object with all the content for an annotation
623  314 if (ann.containsKey("features"))
624    {
625    // logger.debug("Found features for " + acc);
626  0 features = (Hashtable) ann.get("features");
627    }
628    else
629    {
630    // logger.debug("Creating new features holder for " + acc);
631  314 features = new Hashtable();
632  314 ann.put("features", features);
633    }
634   
635  314 Hashtable content;
636  314 if (features.containsKey(this.id2type(type)))
637    {
638    // logger.debug("Found content for " + this.id2type(type));
639  0 content = (Hashtable) features.get(this.id2type(type));
640    }
641    else
642    {
643    // logger.debug("Creating new content holder for " +
644    // this.id2type(type));
645  314 content = new Hashtable();
646  314 features.put(this.id2type(type), content);
647    }
648  314 String ns = (String) content.get(ANNOTATION);
649   
650  314 if (ns == null)
651    {
652  314 ns = "";
653    }
654    // finally, append the annotation line
655  314 ns += seq;
656  314 content.put(ANNOTATION, ns);
657    // // end of wrapped annotation block.
658    // // Now a new row is created with the current set of data
659   
660  314 Hashtable strucAnn;
661  314 if (seqAnn.containsKey(acc))
662    {
663  314 strucAnn = (Hashtable) seqAnn.get(acc);
664    }
665    else
666    {
667  0 strucAnn = new Hashtable();
668    }
669   
670  314 Vector<AlignmentAnnotation> newStruc = new Vector<>();
671  314 parseAnnotationRow(newStruc, type, ns);
672  314 for (AlignmentAnnotation alan : newStruc)
673    {
674  314 alan.visible = false;
675    }
676    // new annotation overwrites any existing annotation...
677   
678  314 strucAnn.put(type, newStruc);
679  314 seqAnn.put(acc, strucAnn);
680    }
681    // }
682    else
683    {
684  0 System.err.println(
685    "Warning - couldn't parse sequence annotation row line:\n"
686    + line);
687    // throw new IOException("Error parsing " + line);
688    }
689    }
690    else
691    {
692  0 throw new IOException(MessageManager.formatMessage(
693    "exception.unknown_annotation_detected", new String[]
694    { annType, annContent }));
695    }
696    }
697    }
698  0 if (treeString.length() > 0)
699    {
700  0 if (treeName == null)
701    {
702  0 treeName = "Tree " + (1 + getTreeCount());
703    }
704  0 addNewickTree(treeName, treeString.toString());
705    }
706    }
707   
708    /**
709    * Demangle an accession string and guess the originating sequence database
710    * for a given sequence
711    *
712    * @param seqO
713    * sequence to be annotated
714    * @param dbr
715    * Accession string for sequence
716    * @param dbsource
717    * source database for alignment (PFAM or RFAM)
718    */
 
719  889 toggle private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
720    {
721  889 DBRefEntry dbrf = null;
722  889 List<DBRefEntry> dbrs = new ArrayList<>();
723  889 String seqdb = "Unknown", sdbac = "" + dbr;
724  889 int st = -1, en = -1, p;
725  ? if ((st = sdbac.indexOf("/")) > -1)
726    {
727  209 String num, range = sdbac.substring(st + 1);
728  209 sdbac = sdbac.substring(0, st);
729  ? if ((p = range.indexOf("-")) > -1)
730    {
731  209 p++;
732  209 if (p < range.length())
733    {
734  209 num = range.substring(p).trim();
735  209 try
736    {
737  209 en = Integer.parseInt(num);
738    } catch (NumberFormatException x)
739    {
740    // could warn here that index is invalid
741  0 en = -1;
742    }
743    }
744    }
745    else
746    {
747  0 p = range.length();
748    }
749  209 num = range.substring(0, p).trim();
750  209 try
751    {
752  209 st = Integer.parseInt(num);
753    } catch (NumberFormatException x)
754    {
755    // could warn here that index is invalid
756  209 st = -1;
757    }
758    }
759  889 if (dbsource == null)
760    {
761    // make up an origin based on whether the sequence looks like it is nucleotide
762    // or protein
763  88 dbsource = (seqO.isProtein()) ? "PFAM" : "RFAM";
764    }
765  889 if (dbsource.equals("PFAM"))
766    {
767  619 seqdb = "UNIPROT";
768  619 if (sdbac.indexOf(".") > -1)
769    {
770    // strip of last subdomain
771  412 sdbac = sdbac.substring(0, sdbac.indexOf("."));
772  412 dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
773    sdbac);
774  412 if (dbrf != null)
775    {
776  412 dbrs.add(dbrf);
777    }
778    }
779  619 dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
780    dbr);
781  619 if (dbr != null)
782    {
783  619 dbrs.add(dbrf);
784    }
785    }
786    else
787    {
788  270 seqdb = "EMBL"; // total guess - could be ENA, or something else these
789    // days
790  270 if (sdbac.indexOf(".") > -1)
791    {
792    // strip off last subdomain
793  209 sdbac = sdbac.substring(0, sdbac.indexOf("."));
794  209 dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
795    sdbac);
796  209 if (dbrf != null)
797    {
798  209 dbrs.add(dbrf);
799    }
800    }
801   
802  270 dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource,
803    dbr);
804  270 if (dbrf != null)
805    {
806  270 dbrs.add(dbrf);
807    }
808    }
809  889 if (st != -1 && en != -1)
810    {
811  0 for (DBRefEntry d : dbrs)
812    {
813  0 jalview.util.MapList mp = new jalview.util.MapList(
814    new int[]
815    { seqO.getStart(), seqO.getEnd() }, new int[] { st, en }, 1,
816    1);
817  0 jalview.datamodel.Mapping mping = new Mapping(mp);
818  0 d.setMap(mping);
819    }
820    }
821    }
822   
 
823  327 toggle protected static AlignmentAnnotation parseAnnotationRow(
824    Vector<AlignmentAnnotation> annotation, String label,
825    String annots)
826    {
827  327 String convert1, convert2 = null;
828   
829    // convert1 = OPEN_PAREN.replaceAll(annots);
830    // convert2 = CLOSE_PAREN.replaceAll(convert1);
831    // annots = convert2;
832   
833  327 String type = label;
834  327 if (label.contains("_cons"))
835    {
836  11 type = (label.indexOf("_cons") == label.length() - 5)
837    ? label.substring(0, label.length() - 5)
838    : label;
839    }
840  327 boolean ss = false, posterior = false;
841  327 type = id2type(type);
842   
843  327 boolean isrnass = false;
844  327 if (type.equalsIgnoreCase("secondary structure"))
845    {
846  321 ss = true;
847  321 isrnass = !NOT_RNASS.search(annots); // sorry about the double negative
848    // here (it's easier for dealing with
849    // other non-alpha-non-brace chars)
850    }
851  327 if (type.equalsIgnoreCase("posterior probability"))
852    {
853  0 posterior = true;
854    }
855    // decide on secondary structure or not.
856  327 Annotation[] els = new Annotation[annots.length()];
857  25856 for (int i = 0; i < annots.length(); i++)
858    {
859  25529 String pos = annots.substring(i, i + 1);
860  25529 Annotation ann;
861  25529 ann = new Annotation(pos, "", ' ', 0f); // 0f is 'valid' null - will not
862    // be written out
863  25529 if (ss)
864    {
865    // if (" .-_".indexOf(pos) == -1)
866    {
867  24878 if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
868    {
869  9834 ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
870  9834 ann.displayCharacter = "" + pos.charAt(0);
871    }
872    else
873    {
874  15044 ann.secondaryStructure = ResidueProperties.getDssp3state(pos)
875    .charAt(0);
876   
877  15044 if (ann.secondaryStructure == pos.charAt(0))
878    {
879  668 ann.displayCharacter = ""; // null; // " ";
880    }
881    else
882    {
883  14376 ann.displayCharacter = " " + ann.displayCharacter;
884    }
885    }
886    }
887   
888    }
889  25529 if (posterior && !ann.isWhitespace()
890    && !Comparison.isGap(pos.charAt(0)))
891    {
892  0 float val = 0;
893    // symbol encodes values - 0..*==0..10
894  0 if (pos.charAt(0) == '*')
895    {
896  0 val = 10;
897    }
898    else
899    {
900  0 val = pos.charAt(0) - '0';
901  0 if (val > 9)
902    {
903  0 val = 10;
904    }
905    }
906  0 ann.value = val;
907    }
908   
909  25529 els[i] = ann;
910    }
911  327 AlignmentAnnotation annot = null;
912  327 Enumeration<AlignmentAnnotation> e = annotation.elements();
913  333 while (e.hasMoreElements())
914    {
915  6 annot = e.nextElement();
916  6 if (annot.label.equals(type))
917    {
918  0 break;
919    }
920  6 annot = null;
921    }
922  327 if (annot == null)
923    {
924  327 annot = new AlignmentAnnotation(type, type, els);
925  327 annotation.addElement(annot);
926    }
927    else
928    {
929  0 Annotation[] anns = new Annotation[annot.annotations.length
930    + els.length];
931  0 System.arraycopy(annot.annotations, 0, anns, 0,
932    annot.annotations.length);
933  0 System.arraycopy(els, 0, anns, annot.annotations.length, els.length);
934  0 annot.annotations = anns;
935    // System.out.println("else: ");
936    }
937  327 return annot;
938    }
939   
 
940  281 toggle private String dbref_to_ac_record(DBRefEntry ref)
941    {
942  281 return ref.getSource().toString() + " ; "
943    + ref.getAccessionId().toString();
944    }
 
945  11 toggle @Override
946    public String print(SequenceI[] s, boolean jvSuffix)
947    {
948  11 out = new StringBuffer();
949  11 out.append("# STOCKHOLM 1.0");
950  11 out.append(newline);
951   
952    // find max length of id
953  11 int max = 0;
954  11 int maxid = 0;
955  11 int in = 0;
956  11 int slen = s.length;
957  11 SequenceI seq;
958  11 Hashtable<String, String> dataRef = null;
959  11 boolean isAA = s[in].isProtein();
960  ? while ((in < slen) && ((seq = s[in]) != null))
961    {
962  295 String tmp = printId(seq, jvSuffix);
963  295 max = Math.max(max, seq.getLength());
964   
965  295 if (tmp.length() > maxid)
966    {
967  16 maxid = tmp.length();
968    }
969  295 List<DBRefEntry> seqrefs = seq.getDBRefs();
970  295 int ndb;
971  ? if (seqrefs != null && (ndb = seqrefs.size()) > 0)
972    {
973  268 if (dataRef == null)
974    {
975  3 dataRef = new Hashtable<>();
976    }
977  268 List<DBRefEntry> primrefs = seq.getPrimaryDBRefs();
978  268 if (primrefs.size() >= 1)
979    {
980  1 dataRef.put(tmp, dbref_to_ac_record(primrefs.get(0)));
981    }
982    else
983    {
984  280 for (int idb = 0; idb < seq.getDBRefs().size(); idb++)
985    {
986  280 DBRefEntry dbref = seq.getDBRefs().get(idb);
987  280 dataRef.put(tmp, dbref_to_ac_record(dbref));
988    // if we put in a uniprot or EMBL record then we're done:
989  280 if (isAA && DBRefSource.UNIPROT
990    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
991    {
992  206 break;
993    }
994  74 if (!isAA && DBRefSource.EMBL
995    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
996    {
997  61 break;
998    }
999    }
1000    }
1001    }
1002  295 in++;
1003    }
1004  11 maxid += 9;
1005  11 int i = 0;
1006   
1007    // output database type
1008  11 if (al.getProperties() != null)
1009    {
1010  3 if (!al.getProperties().isEmpty())
1011    {
1012  3 Enumeration key = al.getProperties().keys();
1013  3 Enumeration val = al.getProperties().elements();
1014  41 while (key.hasMoreElements())
1015    {
1016  38 out.append("#=GF " + key.nextElement() + " " + val.nextElement());
1017  38 out.append(newline);
1018    }
1019    }
1020    }
1021   
1022    // output database accessions
1023  11 if (dataRef != null)
1024    {
1025  3 Enumeration<String> en = dataRef.keys();
1026  271 while (en.hasMoreElements())
1027    {
1028  268 Object idd = en.nextElement();
1029  268 String type = dataRef.remove(idd);
1030  268 out.append(new Format("%-" + (maxid - 2) + "s")
1031    .form("#=GS " + idd.toString() + " "));
1032  268 if (isAA && type.contains("UNIPROT")
1033    || (!isAA && type.contains("EMBL")))
1034    {
1035   
1036  268 out.append(" AC " + type.substring(type.indexOf(";") + 1));
1037    }
1038    else
1039    {
1040  0 out.append(" DR " + type + " ");
1041    }
1042  268 out.append(newline);
1043    }
1044    }
1045   
1046    // output annotations
1047  ? while (i < slen && (seq = s[i]) != null)
1048    {
1049  295 AlignmentAnnotation[] alAnot = seq.getAnnotation();
1050  295 if (alAnot != null)
1051    {
1052  83 Annotation[] ann;
1053  166 for (int j = 0, nj = alAnot.length; j < nj; j++)
1054    {
1055   
1056  83 String key = type2id(alAnot[j].label);
1057  83 boolean isrna = alAnot[j].isValidStruc();
1058   
1059  83 if (isrna)
1060    {
1061    // hardwire to secondary structure if there is RNA secondary
1062    // structure on the annotation
1063  66 key = "SS";
1064    }
1065  83 if (key == null)
1066    {
1067   
1068  4 continue;
1069    }
1070   
1071    // out.append("#=GR ");
1072  79 out.append(new Format("%-" + maxid + "s").form(
1073    "#=GR " + printId(seq, jvSuffix) + " " + key + " "));
1074  79 ann = alAnot[j].annotations;
1075  79 String sseq = "";
1076  7982 for (int k = 0, nk = ann.length; k < nk; k++)
1077    {
1078  7903 sseq += outputCharacter(key, k, isrna, ann, seq);
1079    }
1080  79 out.append(sseq);
1081  79 out.append(newline);
1082    }
1083    }
1084   
1085  295 out.append(new Format("%-" + maxid + "s")
1086    .form(printId(seq, jvSuffix) + " "));
1087  295 out.append(seq.getSequenceAsString());
1088  295 out.append(newline);
1089  295 i++;
1090    }
1091   
1092    // alignment annotation
1093  11 AlignmentAnnotation aa;
1094  11 AlignmentAnnotation[] an = al.getAlignmentAnnotation();
1095  11 if (an != null)
1096    {
1097  94 for (int ia = 0, na = an.length; ia < na; ia++)
1098    {
1099  86 aa = an[ia];
1100  86 if (aa.autoCalculated || !aa.visible || aa.sequenceRef != null)
1101    {
1102  81 continue;
1103    }
1104  5 String sseq = "";
1105  5 String label;
1106  5 String key = "";
1107  5 if (aa.label.equals("seq"))
1108    {
1109  1 label = "seq_cons";
1110    }
1111    else
1112    {
1113  4 key = type2id(aa.label.toLowerCase());
1114  4 if (key == null)
1115    {
1116  0 label = aa.label;
1117    }
1118    else
1119    {
1120  4 label = key + "_cons";
1121    }
1122    }
1123  5 if (label == null)
1124    {
1125  0 label = aa.label;
1126    }
1127  5 label = label.replace(" ", "_");
1128   
1129  5 out.append(
1130    new Format("%-" + maxid + "s").form("#=GC " + label + " "));
1131  5 boolean isrna = aa.isValidStruc();
1132  453 for (int j = 0, nj = aa.annotations.length; j < nj; j++)
1133    {
1134  448 sseq += outputCharacter(key, j, isrna, aa.annotations, null);
1135    }
1136  5 out.append(sseq);
1137  5 out.append(newline);
1138    }
1139    }
1140   
1141  11 out.append("//");
1142  11 out.append(newline);
1143   
1144  11 return out.toString();
1145    }
1146   
1147    /**
1148    * add an annotation character to the output row
1149    *
1150    * @param seq
1151    * @param key
1152    * @param k
1153    * @param isrna
1154    * @param ann
1155    * @param sequenceI
1156    */
 
1157  8351 toggle private char outputCharacter(String key, int k, boolean isrna,
1158    Annotation[] ann, SequenceI sequenceI)
1159    {
1160  8351 char seq = ' ';
1161  8351 Annotation annot = ann[k];
1162  8351 String ch = (annot == null)
1163  2440 ? ((sequenceI == null) ? "-"
1164    : Character.toString(sequenceI.getCharAt(k)))
1165  5911 : (annot.displayCharacter == null
1166    ? String.valueOf(annot.secondaryStructure)
1167    : annot.displayCharacter);
1168  8351 if (ch == null)
1169    {
1170  0 ch = " ";
1171    }
1172  8351 if (key != null && key.equals("SS"))
1173    {
1174  8134 char ssannotchar = ' ';
1175  8134 boolean charset = false;
1176  8134 if (annot == null)
1177    {
1178    // sensible gap character
1179  2440 ssannotchar = ' ';
1180  2440 charset = true;
1181    }
1182    else
1183    {
1184    // valid secondary structure AND no alternative label (e.g. ' B')
1185  5694 if (annot.secondaryStructure > ' ' && ch.length() < 2)
1186    {
1187  3326 ssannotchar = annot.secondaryStructure;
1188  3326 charset = true;
1189    }
1190    }
1191  8134 if (charset)
1192    {
1193  5766 return (ssannotchar == ' ' && isrna) ? '.' : ssannotchar;
1194    }
1195    }
1196   
1197  2585 if (ch.length() == 0)
1198    {
1199  0 seq = '.';
1200    }
1201  2585 else if (ch.length() == 1)
1202    {
1203  454 seq = ch.charAt(0);
1204    }
1205  2131 else if (ch.length() > 1)
1206    {
1207  2131 seq = ch.charAt(1);
1208    }
1209   
1210  2585 return (seq == ' ' && key != null && key.equals("SS") && isrna) ? '.'
1211    : seq;
1212    }
1213   
 
1214  0 toggle public String print()
1215    {
1216  0 out = new StringBuffer();
1217  0 out.append("# STOCKHOLM 1.0");
1218  0 out.append(newline);
1219  0 print(getSeqsAsArray(), false);
1220   
1221  0 out.append("//");
1222  0 out.append(newline);
1223  0 return out.toString();
1224    }
1225   
1226    private static Hashtable typeIds = null;
1227   
 
1228  1 toggle static
1229    {
1230  1 if (typeIds == null)
1231    {
1232  1 typeIds = new Hashtable();
1233  1 typeIds.put("SS", "Secondary Structure");
1234  1 typeIds.put("SA", "Surface Accessibility");
1235  1 typeIds.put("TM", "transmembrane");
1236  1 typeIds.put("PP", "Posterior Probability");
1237  1 typeIds.put("LI", "ligand binding");
1238  1 typeIds.put("AS", "active site");
1239  1 typeIds.put("IN", "intron");
1240  1 typeIds.put("IR", "interacting residue");
1241  1 typeIds.put("AC", "accession");
1242  1 typeIds.put("OS", "organism");
1243  1 typeIds.put("CL", "class");
1244  1 typeIds.put("DE", "description");
1245  1 typeIds.put("DR", "reference");
1246  1 typeIds.put("LO", "look");
1247  1 typeIds.put("RF", "Reference Positions");
1248   
1249    }
1250    }
1251   
 
1252  955 toggle protected static String id2type(String id)
1253    {
1254  955 if (typeIds.containsKey(id))
1255    {
1256  952 return (String) typeIds.get(id);
1257    }
1258  3 System.err.println(
1259    "Warning : Unknown Stockholm annotation type code " + id);
1260  3 return id;
1261    }
1262   
 
1263  401 toggle protected static String type2id(String type)
1264    {
1265  401 String key = null;
1266  401 Enumeration e = typeIds.keys();
1267  4428 while (e.hasMoreElements())
1268    {
1269  4424 Object ll = e.nextElement();
1270  4424 if (typeIds.get(ll).toString().equalsIgnoreCase(type))
1271    {
1272  397 key = (String) ll;
1273  397 break;
1274    }
1275    }
1276  401 if (key != null)
1277    {
1278  397 return key;
1279    }
1280  4 System.err.println(
1281    "Warning : Unknown Stockholm annotation type: " + type);
1282  4 return key;
1283    }
1284   
1285    /**
1286    * make a friendly ID string.
1287    *
1288    * @param dataName
1289    * @return truncated dataName to after last '/'
1290    */
 
1291  0 toggle private String safeName(String dataName)
1292    {
1293  0 int b = 0;
1294  0 while ((b = dataName.indexOf("/")) > -1 && b < dataName.length())
1295    {
1296  0 dataName = dataName.substring(b + 1).trim();
1297   
1298    }
1299  0 int e = (dataName.length() - dataName.indexOf(".")) + 1;
1300  0 dataName = dataName.substring(1, e).trim();
1301  0 return dataName;
1302    }
1303    }