File Gff3Helper.java

Branches:

Statements:

Methods:

Classes:

LOC:

429

NCLOC:

210

Total complexity:

Complexity density:

0.32

Statements/Method:

11.5

Methods/Class:

Average method complexity:

3.62

Classes

Class	Line #	Total Statements	Complexity	Uncovered Elements	TOTAL Coverage	Actions
Gff3Helper	40	92	29	20	0.8507463 (85.1%)

Class Gff3Helper

Class Gff3Helper	Line # 40	Total Statements 92	Complexity 29	Uncovered Elements 20	TOTAL Coverage 0.8507463 (85.1%)
parseNameValuePairs(String) : Map<String, List<String>>	57	1	1	0	1.0 (100%)
processGff(SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	81	13	4	4	0.7894737 (78.9%)
processNucleotideMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	141	35	9	10	0.78723407 (78.7%)
findTargetId(String,Map<String, List<String>>) : String	248	1	1	0	1.0 (100%)
processProteinMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	280	23	4	4	0.86206895 (86.2%)
getNameValueSeparator() : char	355	1	1	1	0.0 (0%)
buildSequenceFeature(String[],int,String,Map<String, List<String>>) : SequenceFeature	365	5	2	0	1.0 (100%)
getDescription(SequenceFeature,Map<String, List<String>>) : String	387	13	7	0	1.0 (100%)

Contributing tests

This file is covered by 7 tests.

Contributing tests

Test contribution	Test	Result
0.5522388	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToReverse	1PASS
0.5298507	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToForward	1PASS
0.5298507	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_spliced	1PASS
0.37313432	jalview.io.gff.InterProScanHelperTest.testProcessProteinMatch	1PASS
0.29104477	jalview.io.FeaturesFileTest.testParse_pureGff3	1PASS
0.1641791	jalview.io.gff.Gff3HelperTest.testGetDescription	1PASS
0.1641791	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_reverseToForward	1PASS

Source view

/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */

package jalview.io.gff;

import jalview.datamodel.AlignedCodonFrame;

import jalview.datamodel.AlignmentI;

import jalview.datamodel.MappingType;

import jalview.datamodel.SequenceFeature;

import jalview.datamodel.SequenceI;

import jalview.util.MapList;

import jalview.util.StringUtils;

import java.io.IOException;

import java.util.List;

import java.util.Map;

/**

* Base class with generic / common functionality for processing GFF3 data.

* Override this as required for any specialisations resulting from

* peculiarities of GFF3 generated by particular tools.

public class Gff3Helper extends GffHelperBase

{

public static final String ALLELES = "alleles";

protected static final String TARGET = "Target";

protected static final String ID = "ID";

private static final String NAME = "Name";

/**

* GFF3 uses '=' to delimit name/value pairs in column 9, and comma to

* separate multiple values for a name

* @param text

* @return

public static Map<String, List<String>> parseNameValuePairs(String text)

{

return parseNameValuePairs(text, ";", '=', ",");

}

/**

* Process one GFF feature line (as modelled by SequenceFeature)

* @param seq

* the sequence with which this feature is associated

* @param sf

* the sequence feature with ATTRIBUTES property containing any

* additional attributes

* @param align

* the alignment we are adding GFF to

* @param newseqs

* any new sequences referenced by the GFF

* @param relaxedIdMatching

* if true, match word tokens in sequence names

* @return true if the sequence feature should be added to the sequence, else

* false (i.e. it has been processed in another way e.g. to generate a

* mapping)

* @throws IOException

@Override

public SequenceFeature processGff(SequenceI seq, String[] gff,

AlignmentI align, List<SequenceI> newseqs,

boolean relaxedIdMatching) throws IOException

{

SequenceFeature sf = null;

if (gff.length == 9)

{

String soTerm = gff[TYPE_COL];

String atts = gff[ATTRIBUTES_COL];

Map<String, List<String>> attributes = parseNameValuePairs(atts);

SequenceOntologyI so = SequenceOntologyFactory.getInstance();

if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))

{

sf = processProteinMatch(attributes, seq, gff, align, newseqs,

relaxedIdMatching);

}

100

else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))

101

{

102

sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,

relaxedIdMatching);

}

else

{

sf = buildSequenceFeature(gff, attributes);

}

}

else

{

* fall back on generating a sequence feature with no special processing

114

115

sf = buildSequenceFeature(gff, null);

}

return sf;

}

/**

* Processes one GFF3 nucleotide (e.g. cDNA to genome) match.

123

124

* @param attributes

125

* parsed GFF column 9 key/value(s)

126

* @param seq

127

* the sequence the GFF feature is on

128

* @param gffColumns

129

* the GFF column data

130

* @param align

131

* the alignment the sequence belongs to, where any new mappings

132

* should be added

133

* @param newseqs

134

* a list of new 'virtual sequences' generated while parsing GFF

135

* @param relaxedIdMatching

136

* if true allow fuzzy search for a matching target sequence

137

* @return a sequence feature, if one should be added to the sequence, else

138

* null

139

* @throws IOException

140

141

protected SequenceFeature processNucleotideMatch(

142

Map<String, List<String>> attributes, SequenceI seq,

143

String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,

144

boolean relaxedIdMatching) throws IOException

145

{

146

String strand = gffColumns[STRAND_COL];

147

148

149

* (For now) we don't process mappings from reverse complement ; to do

150

* this would require (a) creating a virtual sequence placeholder for

151

* the reverse complement (b) resolving the sequence by its id from some

152

* source (GFF ##FASTA or other) (c) creating the reverse complement

153

* sequence (d) updating the mapping to be to the reverse complement

154

155

if ("-".equals(strand))

156

{

157

System.err.println(

158

"Skipping mapping from reverse complement as not yet supported");

return null;

}

List<String> targets = attributes.get(TARGET);

163

if (targets == null)

164

{

165

System.err.println("'Target' missing in GFF");

return null;

}

* Typically we only expect one Target per GFF line, but this can handle

171

* multiple matches, to the same or different sequences (e.g. dna variants)

172

173

for (String target : targets)

174

{

175

176

* Process "seqid start end [strand]"

177

178

String[] tokens = target.split(" ");

179

if (tokens.length < 3)

180

{

181

System.err.println("Incomplete Target: " + target);

continue;

}

* Locate the mapped sequence in the alignment, or as a

187

* (new or existing) virtual sequence in the newseqs list

188

189

String targetId = findTargetId(tokens[0], attributes);

190

SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,

191

relaxedIdMatching);

192

SequenceI mappedSequence = mappedSequence1;

193

if (mappedSequence == null)

{

continue;

}

* get any existing mapping for these sequences (or start one),

200

* and add this mapped range

201

202

AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);

try

{

int toStart = Integer.parseInt(tokens[1]);

207

int toEnd = Integer.parseInt(tokens[2]);

208

if (tokens.length > 3 && "-".equals(tokens[3]))

209

{

210

// mapping to reverse strand - swap start/end

int temp = toStart;

toStart = toEnd;

toEnd = temp;

}

int fromStart = Integer.parseInt(gffColumns[START_COL]);

217

int fromEnd = Integer.parseInt(gffColumns[END_COL]);

218

MapList mapping = constructMappingFromAlign(fromStart, fromEnd,

219

toStart, toEnd, MappingType.NucleotideToNucleotide);

if (mapping != null)

{

acf.addMap(seq, mappedSequence, mapping);

224

align.addCodonFrame(acf);

225

}

226

} catch (NumberFormatException nfe)

227

{

228

System.err.println("Invalid start or end in Target " + target);

}

}

SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);

return sf;

}

/**

* Returns the target sequence id extracted from the GFF name/value pairs.

238

* Default (standard behaviour) is the first token for "Target". This may be

239

* overridden where tools report this in a non-standard way.

240

241

* @param target

242

* first token of a "Target" value from GFF column 9, typically

243

* "seqid start end"

244

* @param set

245

* a map with all parsed column 9 attributes

246

* @return

247

248

@SuppressWarnings("unused")

249

protected String findTargetId(String target,

250

Map<String, List<String>> set)

{

return target;

}

/**

* Processes one GFF 'protein_match'; fields of interest are

257

* <ul>

258

* <li>feature group - the database reporting a match e.g. Pfam</li>

259

* <li>Name - the matched entry's accession id in the database</li>

260

* <li>ID - a sequence identifier for the matched region (which may be

261

* appended as FASTA in the GFF file)</li>

* </ul>

* @param set

* parsed GFF column 9 key/value(s)

266

* @param seq

267

* the sequence the GFF feature is on

268

* @param gffColumns

269

* the sequence feature holding GFF data

270

* @param align

271

* the alignment the sequence belongs to, where any new mappings

272

* should be added

273

* @param newseqs

274

* a list of new 'virtual sequences' generated while parsing GFF

275

* @param relaxedIdMatching

276

* if true allow fuzzy search for a matching target sequence

277

* @return the (real or virtual) sequence(s) mapped to by this match

278

* @throws IOException

279

280

protected SequenceFeature processProteinMatch(

281

Map<String, List<String>> set, SequenceI seq, String[] gffColumns,

282

AlignmentI align, List<SequenceI> newseqs,

283

boolean relaxedIdMatching)

284

{

285

// This is currently tailored to InterProScan GFF output:

286

// ID holds the ID of the matched sequence, Target references the

287

// query sequence; this looks wrong, as ID should just be the GFF internal

288

// ID of the GFF feature, while Target would normally reference the matched

289

// sequence.

290

// TODO refactor as needed if other protein-protein GFF varies

291

292

SequenceFeature sf = buildSequenceFeature(gffColumns, set);

293

294

295

* locate the mapped sequence in the alignment, or as a

296

* (new or existing) virtual sequence in the newseqs list

297

298

List<String> targets = set.get(TARGET);

299

if (targets != null)

300

{

301

for (String target : targets)

302

{

303

304

SequenceI mappedSequence1 = findSequence(findTargetId(target, set),

305

align, newseqs, relaxedIdMatching);

306

SequenceI mappedSequence = mappedSequence1;

307

if (mappedSequence == null)

{

continue;

}

* give the mapped sequence a copy of the sequence feature, with

314

* start/end range adjusted

315

316

int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();

317

SequenceFeature sf2 = new SequenceFeature(sf, 1,

318

sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());

319

mappedSequence.addSequenceFeature(sf2);

320

321

322

* add a property to the mapped sequence so that it can eventually be

323

* renamed with its qualified accession id; renaming has to wait until

324

* all sequence reference resolution is complete

325

326

String accessionId = StringUtils

327

.listToDelimitedString(set.get(NAME), ",");

328

if (accessionId.length() > 0)

329

{

330

String database = sf.getType(); // TODO InterProScan only??

331

String qualifiedAccId = database + "|" + accessionId;

332

sf2.setValue(RENAME_TOKEN, qualifiedAccId);

}

* get any existing mapping for these sequences (or start one),

337

* and add this mapped range

338

339

AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);

340

int[] from = new int[] { sf.getBegin(), sf.getEnd() };

341

int[] to = new int[] { 1, sequenceFeatureLength };

342

MapList mapping = new MapList(from, to, 1, 1);

343

344

alco.addMap(seq, mappedSequence, mapping);

345

align.addCodonFrame(alco);

}

}

return sf;

}

/**

* Return '=' as the name-value separator used in column 9 attributes.

354

355

@Override

356

protected char getNameValueSeparator()

{

return '=';

}

/**

* Modifies the default SequenceFeature in order to set the Target sequence id

* as the description

@Override

protected SequenceFeature buildSequenceFeature(String[] gff,

367

int typeColumn, String group,

368

Map<String, List<String>> attributes)

369

{

370

SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,

371

attributes);

372

String desc = getDescription(sf, attributes);

373

if (desc != null)

374

{

375

sf.setDescription(desc);

}

return sf;

}

/**

* Apply heuristic rules to try to get the most useful feature description

* @param sf

* @param attributes

* @return

protected String getDescription(SequenceFeature sf,

388

Map<String, List<String>> attributes)

389

{

390

String desc = null;

391

String target = (String) sf.getValue(TARGET);

392

if (target != null)

393

{

394

desc = target.split(" ")[0];

395

}

396

397

SequenceOntologyI so = SequenceOntologyFactory.getInstance();

398

String type = sf.getType();

399

if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))

400

{

401

402

* Ensembl returns dna variants as 'alleles'

403

404

desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),

",");

}

* extract 'Name' for a transcript (to show gene name)

410

* or an exon (so 'colour by label' shows exon boundaries)

411

412

if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)

413

|| so.isA(type, SequenceOntologyI.TRANSCRIPT)

414

|| so.isA(type, SequenceOntologyI.EXON))

415

{

416

desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");

}

* if the above fails, try ID

if (desc == null)

{

desc = (String) sf.getValue(ID);

}

return desc;

}

}

jalviewX

File Gff3Helper.java

Coverage histogram

Code metrics

Classes

Class Gff3Helper

Contributing tests

Contributing tests

Source view