Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 15:05:32 GMT
  2. Package jalview.ws.dbsources

File Uniprot.java

 

Coverage histogram

../../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

74
166
17
1
604
413
69
0.42
9.76
17
4.06

Classes

Class Line # Actions
Uniprot 70 166 69
0.71206224 71.2%
 

Contributing tests

This file is covered by 203 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.ws.dbsources;
22   
23    import java.io.InputStream;
24    import java.net.HttpURLConnection;
25    import java.net.URL;
26    import java.util.ArrayList;
27    import java.util.List;
28    import java.util.Locale;
29    import java.util.Vector;
30   
31    import javax.xml.bind.JAXBContext;
32    import javax.xml.bind.JAXBElement;
33    import javax.xml.bind.JAXBException;
34    import javax.xml.stream.FactoryConfigurationError;
35    import javax.xml.stream.XMLInputFactory;
36    import javax.xml.stream.XMLStreamException;
37    import javax.xml.stream.XMLStreamReader;
38   
39    import com.stevesoft.pat.Regex;
40   
41    import jalview.bin.Cache;
42    import jalview.bin.Console;
43    import jalview.datamodel.Alignment;
44    import jalview.datamodel.AlignmentI;
45    import jalview.datamodel.DBRefEntry;
46    import jalview.datamodel.DBRefSource;
47    import jalview.datamodel.PDBEntry;
48    import jalview.datamodel.Sequence;
49    import jalview.datamodel.SequenceFeature;
50    import jalview.datamodel.SequenceI;
51    import jalview.schemes.ResidueProperties;
52    import jalview.util.HttpUtils;
53    import jalview.util.StringUtils;
54    import jalview.ws.seqfetcher.DbSourceProxyImpl;
55    import jalview.xml.binding.uniprot.DbReferenceType;
56    import jalview.xml.binding.uniprot.Entry;
57    import jalview.xml.binding.uniprot.FeatureType;
58    import jalview.xml.binding.uniprot.LocationType;
59    import jalview.xml.binding.uniprot.PositionType;
60    import jalview.xml.binding.uniprot.PropertyType;
61   
62    /**
63    * This class queries the Uniprot database for sequence data, unmarshals the
64    * returned XML, and converts it to Jalview Sequence records (including attached
65    * database references and sequence features)
66    *
67    * @author JimP
68    *
69    */
 
70    public class Uniprot extends DbSourceProxyImpl
71    {
72    private static final String DEFAULT_UNIPROT_DOMAIN = "https://rest.uniprot.org";
73   
74    private static final String BAR_DELIMITER = "|";
75   
76    /**
77    * Constructor
78    */
 
79  22 toggle public Uniprot()
80    {
81  22 super();
82    }
83   
 
84  0 toggle private String getDomain()
85    {
86  0 return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN);
87    }
88   
89    /*
90    * (non-Javadoc)
91    *
92    * @see jalview.ws.DbSourceProxy#getAccessionSeparator()
93    */
 
94  0 toggle @Override
95    public String getAccessionSeparator()
96    {
97  0 return null;
98    }
99   
100    /*
101    * (non-Javadoc)
102    *
103    * @see jalview.ws.DbSourceProxy#getAccessionValidator()
104    */
 
105  0 toggle @Override
106    public Regex getAccessionValidator()
107    {
108  0 return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)");
109    }
110   
111    /*
112    * (non-Javadoc)
113    *
114    * @see jalview.ws.DbSourceProxy#getDbSource()
115    */
 
116  2892 toggle @Override
117    public String getDbSource()
118    {
119  2892 return DBRefSource.UNIPROT;
120    }
121   
122    /*
123    * (non-Javadoc)
124    *
125    * @see jalview.ws.DbSourceProxy#getDbVersion()
126    */
 
127  2 toggle @Override
128    public String getDbVersion()
129    {
130  2 return "0"; // we really don't know what version we're on.
131    }
132   
133    /*
134    * (non-Javadoc)
135    *
136    * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
137    */
 
138  0 toggle @Override
139    public AlignmentI getSequenceRecords(String queries) throws Exception
140    {
141  0 startQuery();
142  0 try
143    {
144  0 queries = queries.toUpperCase(Locale.ROOT).replaceAll(
145    "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", "");
146  0 AlignmentI al = null;
147   
148  0 String downloadstring = getDomain() + "/uniprotkb/" + queries
149    + ".xml";
150   
151  0 URL url = new URL(downloadstring);
152  0 HttpURLConnection urlconn = (HttpURLConnection) HttpUtils
153    .openConnection(url);
154    // anything other than 200 means we don't have data
155    // TODO: JAL-3882 reuse the EnsemblRestClient's fair
156    // use/backoff logic to retry when the server tells us to go away
157  0 if (urlconn.getResponseCode() == 200)
158    {
159  0 InputStream istr = urlconn.getInputStream();
160  0 List<Entry> entries = getUniprotEntries(istr);
161  0 if (entries != null)
162    {
163  0 List<SequenceI> seqs = new ArrayList<>();
164  0 for (Entry entry : entries)
165    {
166  0 seqs.add(uniprotEntryToSequence(entry));
167    }
168  0 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
169    }
170    }
171  0 stopQuery();
172  0 return al;
173   
174    } catch (Exception e)
175    {
176  0 throw (e);
177    } finally
178    {
179  0 stopQuery();
180    }
181    }
182   
183    /**
184    * Converts an Entry object (bound from Uniprot XML) to a Jalview Sequence
185    *
186    * @param entry
187    * @return
188    */
 
189  2 toggle SequenceI uniprotEntryToSequence(Entry entry)
190    {
191  2 String id = getUniprotEntryId(entry);
192    /*
193    * Sequence should not include any whitespace, but JAXB leaves these in
194    */
195  2 String seqString = entry.getSequence().getValue().replaceAll("\\s*",
196    "");
197   
198  2 SequenceI sequence = new Sequence(id, seqString);
199  2 sequence.setDescription(getUniprotEntryDescription(entry));
200  2 final String uniprotRecordVersion = "" + entry.getVersion();
201    /*
202    * add a 'self' DBRefEntry for each accession
203    */
204  2 final String dbVersion = getDbVersion();
205  2 List<DBRefEntry> dbRefs = new ArrayList<>();
206  2 boolean canonical = true;
207  2 for (String accessionId : entry.getAccession())
208    {
209  4 DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT,
210    uniprotRecordVersion, accessionId, null, canonical);
211  4 canonical = false;
212  4 dbRefs.add(dbRef);
213    }
214   
215    /*
216    * add a DBRefEntry for each dbReference element in the XML;
217    * also add a PDBEntry if type="PDB";
218    * also add an EMBLCDS dbref if protein sequence id is given
219    * also add an Ensembl dbref " " " " " "
220    */
221  2 Vector<PDBEntry> pdbRefs = new Vector<>();
222  2 for (DbReferenceType dbref : entry.getDbReference())
223    {
224  48 String type = dbref.getType();
225  48 DBRefEntry dbr = new DBRefEntry(type,
226    DBRefSource.UNIPROT + ":" + dbVersion, dbref.getId());
227  48 dbRefs.add(dbr);
228  48 if ("PDB".equals(type))
229    {
230  1 pdbRefs.add(new PDBEntry(dbr));
231    }
232  48 if ("EMBL".equals(type))
233    {
234    /*
235    * e.g. Uniprot accession Q9BXM7 has
236    * <dbReference type="EMBL" id="M19359">
237    * <property type="protein sequence ID" value="AAA40981.1"/>
238    * <property type="molecule type" value="Genomic_DNA"/>
239    * </dbReference>
240    */
241  9 String cdsId = getProperty(dbref.getProperty(),
242    "protein sequence ID");
243  9 if (cdsId != null && cdsId.trim().length() > 0)
244    {
245    // remove version
246  9 String[] vrs = cdsId.split("\\.");
247  9 String version = vrs.length > 1 ? vrs[1]
248    : DBRefSource.UNIPROT + ":" + uniprotRecordVersion;
249  9 dbr = new DBRefEntry(DBRefSource.EMBLCDS, version, vrs[0]);
250    // TODO: process VARIANT features to allow EMBLCDS record's product to
251    // match Uniprot
252  9 dbr.setCanonical(true);
253  9 dbRefs.add(dbr);
254    }
255    }
256  48 if (type != null
257    && type.toLowerCase(Locale.ROOT).startsWith("ensembl"))
258    {
259    // remove version
260  0 String[] vrs = dbref.getId().split("\\.");
261  0 String version = vrs.length > 1 ? vrs[1]
262    : DBRefSource.UNIPROT + ":" + uniprotRecordVersion;
263  0 dbr.setAccessionId(vrs[0]);
264  0 dbr.setVersion(version);
265    /*
266    * e.g. Uniprot accession Q9BXM7 has
267    * <dbReference type="Ensembl" id="ENST00000321556">
268    * <molecule id="Q9BXM7-1"/>
269    * <property type="protein sequence ID" value="ENSP00000364204"/>
270    * <property type="gene ID" value="ENSG00000158828"/>
271    * </dbReference>
272    */
273  0 String cdsId = getProperty(dbref.getProperty(),
274    "protein sequence ID");
275  0 if (cdsId != null && cdsId.trim().length() > 0)
276    {
277    // remove version
278  0 String[] cdsVrs = cdsId.split("\\.");
279  0 String cdsVersion = cdsVrs.length > 1 ? cdsVrs[1]
280    : DBRefSource.UNIPROT + ":" + uniprotRecordVersion;
281  0 dbr = new DBRefEntry(DBRefSource.ENSEMBL,
282    DBRefSource.UNIPROT + ":" + cdsVersion, cdsVrs[0]);
283  0 dbRefs.add(dbr);
284    }
285    }
286    }
287   
288    /*
289    * create features; they have either begin and end, or position, in XML
290    */
291  2 sequence.setPDBId(pdbRefs);
292  2 if (entry.getFeature() != null)
293    {
294  2 for (FeatureType uf : entry.getFeature())
295    {
296  21 LocationType location = uf.getLocation();
297  21 int start = 0;
298  21 int end = 0;
299  21 String uncertain_start = null, uncertain_end = null,
300    uncertain_pos = null;
301  21 if (location.getPosition() != null)
302    {
303  8 if (location.getPosition().getPosition() == null
304    || "unknown".equals(location.getPosition().getStatus()))
305    {
306  0 Console.warn(
307    "Ignoring single position feature with uncertain location "
308    + uf.getType() + ":" + getDescription(uf));
309  0 uncertain_pos = location.getPosition().getStatus() == null
310    ? "unknown"
311    : location.getPosition().getStatus();
312    }
313    else
314    {
315  8 start = location.getPosition().getPosition().intValue();
316  8 end = start;
317    }
318    }
319    else
320    {
321  13 if (location.getBegin().getPosition() == null)
322    {
323  1 Console.warn(
324    "Setting start position of feature with uncertain start to 1: "
325    + uf.getType() + ":" + getDescription(uf));
326  1 start = sequence.getStart();
327  1 uncertain_start = location.getBegin().getStatus();
328    }
329    else
330    {
331  12 start = location.getBegin().getPosition().intValue();
332    }
333  13 if (location.getEnd().getPosition() == null)
334    {
335  1 Console.warn(
336    "Setting start position of feature with uncertain start to 1: "
337    + uf.getType() + ":" + getDescription(uf));
338  1 end = sequence.getEnd();
339  1 uncertain_end = location.getEnd().getStatus();
340    }
341    else
342    {
343  12 end = location.getEnd().getPosition().intValue();
344    }
345    }
346  21 SequenceFeature sf = new SequenceFeature(uf.getType(),
347    getDescription(uf), start, end, "Uniprot");
348  21 sf.setStatus(uf.getStatus());
349  21 if (uncertain_end != null)
350    {
351  1 sf.setValue("end_status", uncertain_end);
352    }
353  21 if (uncertain_start != null)
354    {
355  1 sf.setValue("start_status", uncertain_start);
356    }
357  21 if (uncertain_pos != null)
358    {
359  0 sf.setValue("pos_status", uncertain_pos);
360    }
361  21 sequence.addSequenceFeature(sf);
362    }
363    }
364  2 for (DBRefEntry dbr : dbRefs)
365    {
366  61 sequence.addDBRef(dbr);
367    }
368  2 return sequence;
369    }
370   
371    /**
372    * A helper method that builds a sequence feature description
373    *
374    * @param feature
375    * @return
376    */
 
377  35 toggle static String getDescription(FeatureType feature)
378    {
379  35 String orig = feature.getOriginal();
380  35 List<String> variants = feature.getVariation();
381  35 StringBuilder sb = new StringBuilder();
382   
383    /*
384    * append variant in standard format if present
385    * e.g. p.Arg59Lys
386    * multiple variants are split over lines using <br>
387    */
388  35 boolean asHtml = false;
389  35 if (orig != null && !orig.isEmpty() && variants != null
390    && !variants.isEmpty())
391    {
392  14 int p = 0;
393  14 for (String var : variants)
394    {
395    // TODO proper HGVS nomenclature for delins structural variations
396    // http://varnomen.hgvs.org/recommendations/protein/variant/delins/
397    // for now we are pragmatic - any orig/variant sequence longer than
398    // three characters is shown with single-character notation rather than
399    // three-letter notation
400  21 sb.append("p.");
401  21 if (orig.length() < 4)
402    {
403  36 for (int c = 0, clen = orig.length(); c < clen; c++)
404    {
405  21 char origchar = orig.charAt(c);
406  21 String orig3 = ResidueProperties.aa2Triplet.get("" + origchar);
407  21 sb.append(orig3 == null ? origchar
408    : StringUtils.toSentenceCase(orig3));
409    }
410    }
411    else
412    {
413  6 sb.append(orig);
414    }
415   
416  21 LocationType location = feature.getLocation();
417  21 PositionType start = location.getPosition() == null
418    ? location.getBegin()
419    : location.getPosition();
420  21 sb.append(Integer.toString(start.getPosition().intValue()));
421   
422  21 if (var.length() < 4)
423    {
424  52 for (int c = 0, clen = var.length(); c < clen; c++)
425    {
426  34 char varchar = var.charAt(c);
427  34 String var3 = ResidueProperties.aa2Triplet.get("" + varchar);
428   
429  34 sb.append(var3 != null ? StringUtils.toSentenceCase(var3)
430    : "" + varchar);
431    }
432    }
433    else
434    {
435  3 sb.append(var);
436    }
437  21 if (++p != variants.size())
438    {
439  7 sb.append("<br/>&nbsp;&nbsp;");
440  7 asHtml = true;
441    }
442    else
443    {
444  14 sb.append(" ");
445    }
446    }
447    }
448  35 String description = feature.getDescription();
449  35 if (description != null)
450    {
451  26 sb.append(description);
452    }
453  35 if (asHtml)
454    {
455  7 sb.insert(0, "<html>");
456  7 sb.append("</html>");
457    }
458   
459  35 return sb.toString();
460    }
461   
462    /**
463    * A helper method that searches the list of properties for one with the given
464    * key, and if found returns the property value, else returns null
465    *
466    * @param properties
467    * @param key
468    * @return
469    */
 
470  13 toggle static String getProperty(List<PropertyType> properties, String key)
471    {
472  13 String value = null;
473  13 if (properties != null)
474    {
475  13 for (PropertyType prop : properties)
476    {
477  15 if (key.equals(prop.getType()))
478    {
479  13 value = prop.getValue();
480  13 break;
481    }
482    }
483    }
484  13 return value;
485    }
486   
487    /**
488    * Extracts xml element entry/protein/recommendedName/fullName
489    *
490    * @param entry
491    * @return
492    */
 
493  3 toggle static String getUniprotEntryDescription(Entry entry)
494    {
495  3 String desc = "";
496  3 if (entry.getProtein() != null
497    && entry.getProtein().getRecommendedName() != null)
498    {
499    // fullName is mandatory if recommendedName is present
500  3 desc = entry.getProtein().getRecommendedName().getFullName()
501    .getValue();
502    }
503  3 return desc;
504    }
505   
506    /**
507    * Constructs a sequence id by concatenating all entry/name elements with '|'
508    * separator
509    *
510    * @param entry
511    * @return
512    */
 
513  3 toggle static String getUniprotEntryId(Entry entry)
514    {
515  3 StringBuilder name = new StringBuilder(32);
516  3 for (String n : entry.getName())
517    {
518  5 if (name.length() > 0)
519    {
520  2 name.append(BAR_DELIMITER);
521    }
522  5 name.append(n);
523    }
524  3 return name.toString();
525    }
526   
527    /*
528    * (non-Javadoc)
529    *
530    * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
531    */
 
532  0 toggle @Override
533    public boolean isValidReference(String accession)
534    {
535    // TODO: make the following a standard validator
536  0 return (accession == null || accession.length() < 2) ? false
537    : getAccessionValidator().search(accession);
538    }
539   
540    /**
541    * return LDHA_CHICK uniprot entry
542    */
 
543  0 toggle @Override
544    public String getTestQuery()
545    {
546  0 return "P00340";
547    }
548   
 
549  2876 toggle @Override
550    public String getDbName()
551    {
552  2876 return "Uniprot"; // getDbSource();
553    }
554   
 
555  0 toggle @Override
556    public int getTier()
557    {
558  0 return 0;
559    }
560   
561    /**
562    * Reads the reply to the EBI Fetch Uniprot data query, unmarshals it to an
563    * Uniprot object, and returns the enclosed Entry objects, or null on any
564    * failure
565    *
566    * @param is
567    * @return
568    */
 
569  5 toggle public List<Entry> getUniprotEntries(InputStream is)
570    {
571  5 List<Entry> entries = null;
572  5 try
573    {
574  5 JAXBContext jc = JAXBContext
575    .newInstance("jalview.xml.binding.uniprot");
576  5 XMLStreamReader streamReader = XMLInputFactory.newInstance()
577    .createXMLStreamReader(is);
578  5 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
579  5 JAXBElement<jalview.xml.binding.uniprot.Uniprot> uniprotElement = um
580    .unmarshal(streamReader,
581    jalview.xml.binding.uniprot.Uniprot.class);
582  5 jalview.xml.binding.uniprot.Uniprot uniprot = uniprotElement
583    .getValue();
584   
585  5 if (uniprot != null && !uniprot.getEntry().isEmpty())
586    {
587  5 entries = uniprot.getEntry();
588    }
589    } catch (JAXBException | XMLStreamException
590    | FactoryConfigurationError e)
591    {
592  0 if (e instanceof javax.xml.bind.UnmarshalException
593    && e.getCause() != null
594    && e.getCause() instanceof XMLStreamException
595    && e.getCause().getMessage().contains("[row,col]:[1,1]"))
596    {
597    // trying to parse an empty stream
598  0 return null;
599    }
600  0 e.printStackTrace();
601    }
602  5 return entries;
603    }
604    }