File Gff3Helper.java

Branches:

Statements:

Methods:

Classes:

LOC:

426

NCLOC:

207

Total complexity:

Complexity density:

0.3

Statements/Method:

13.14

Methods/Class:

Average method complexity:

Classes

Class	Line #	Total Statements	Complexity	TOTAL Coverage	Actions
Gff3Helper	40	92	28	0.8646616386.5%

Class Gff3Helper

Class Gff3Helper	Line # 40	Total Statements 92	Complexity 28	TOTAL Coverage 0.8646616386.5%
parseNameValuePairs(String) : Map<String, List<String>> parseNameValuePairs(String) : Map<String, List<String>>	5757	1.01	1.01	1.0 1.0100%
processGff(SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature processGff(SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	8181	13.013	4.04	0.7894737 0.789473778.9%
processNucleotideMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature processNucleotideMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	141141	35.035	9.09	0.78723407 0.7872340778.7%
findTargetId(String,Map<String, List<String>>) : String findTargetId(String,Map<String, List<String>>) : String	249249	1.01	1.01	1.0 1.0100%
processProteinMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature processProteinMatch(Map<String, List<String>>,SequenceI,String[],AlignmentI,List<SequenceI>,boolean) : SequenceFeature	281281	23.023	4.04	0.86206895 0.8620689586.2%
buildSequenceFeature(String[],int,String,Map<String, List<String>>) : SequenceFeature buildSequenceFeature(String[],int,String,Map<String, List<String>>) : SequenceFeature	357357	5.05	2.02	1.0 1.0100%
getDescription(SequenceFeature,Map<String, List<String>>) : String getDescription(SequenceFeature,Map<String, List<String>>) : String	379379	14.014	7.07	1.0 1.0100%

Contributing tests

This file is covered by 7 tests.

Contributing tests

Test contribution	Test	Result
0.56390977	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToReversejalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToReverse	1PASS
0.5413534	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_splicedjalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_spliced	1PASS
0.5413534	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToForwardjalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_forwardToForward	1PASS
0.38345864	jalview.io.gff.InterProScanHelperTest.testProcessProteinMatchjalview.io.gff.InterProScanHelperTest.testProcessProteinMatch	1PASS
0.30075186	jalview.io.FeaturesFileTest.testParse_pureGff3jalview.io.FeaturesFileTest.testParse_pureGff3	1PASS
0.17293233	jalview.io.gff.Gff3HelperTest.testGetDescriptionjalview.io.gff.Gff3HelperTest.testGetDescription	1PASS
0.16541353	jalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_reverseToForwardjalview.io.gff.Gff3HelperTest.testProcessCdnaMatch_reverseToForward	1PASS

Source view

/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */

package jalview.io.gff;

import jalview.datamodel.AlignedCodonFrame;

import jalview.datamodel.AlignmentI;

import jalview.datamodel.MappingType;

import jalview.datamodel.SequenceFeature;

import jalview.datamodel.SequenceI;

import jalview.util.MapList;

import jalview.util.StringUtils;

import java.io.IOException;

import java.util.List;

import java.util.Map;

/**
 * Base class with generic / common functionality for processing GFF3 data.
 * Override this as required for any specialisations resulting from
 * peculiarities of GFF3 generated by particular tools.
 */

public class Gff3Helper extends GffHelperBase

{

public static final String ALLELES = "alleles";

protected static final String TARGET = "Target";

protected static final String ID = "ID";

private static final String NAME = "Name";

/**

* GFF3 uses '=' to delimit name/value pairs in column 9, and comma to

* separate multiple values for a name

* @param text

* @return

public static Map<String, List<String>> parseNameValuePairs(String text)

{

return parseNameValuePairs(text, ";", '=', ",");

}

/**

* Process one GFF feature line (as modelled by SequenceFeature)

* @param seq

* the sequence with which this feature is associated

* @param sf

* the sequence feature with ATTRIBUTES property containing any

* additional attributes

* @param align

* the alignment we are adding GFF to

* @param newseqs

* any new sequences referenced by the GFF

* @param relaxedIdMatching

* if true, match word tokens in sequence names

* @return true if the sequence feature should be added to the sequence, else

* false (i.e. it has been processed in another way e.g. to generate a

* mapping)

* @throws IOException

@Override

public SequenceFeature processGff(SequenceI seq, String[] gff,

AlignmentI align, List<SequenceI> newseqs,

boolean relaxedIdMatching) throws IOException

{

SequenceFeature sf = null;

if (gff.length == 9)

{

String soTerm = gff[TYPE_COL];

String atts = gff[ATTRIBUTES_COL];

Map<String, List<String>> attributes = parseNameValuePairs(atts);

SequenceOntologyI so = SequenceOntologyFactory.getInstance();

if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))

{

sf = processProteinMatch(attributes, seq, gff, align, newseqs,

relaxedIdMatching);

}

100

else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))

101

{

102

sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,

relaxedIdMatching);

}

else

{

sf = buildSequenceFeature(gff, attributes);

}

}

else

{

* fall back on generating a sequence feature with no special processing

114

115

sf = buildSequenceFeature(gff, null);

}

return sf;

}

/**

* Processes one GFF3 nucleotide (e.g. cDNA to genome) match.

123

124

* @param attributes

125

* parsed GFF column 9 key/value(s)

126

* @param seq

127

* the sequence the GFF feature is on

128

* @param gffColumns

129

* the GFF column data

130

* @param align

131

* the alignment the sequence belongs to, where any new mappings

132

* should be added

133

* @param newseqs

134

* a list of new 'virtual sequences' generated while parsing GFF

135

* @param relaxedIdMatching

136

* if true allow fuzzy search for a matching target sequence

137

* @return a sequence feature, if one should be added to the sequence, else

138

* null

139

* @throws IOException

140

141

protected SequenceFeature processNucleotideMatch(

142

Map<String, List<String>> attributes, SequenceI seq,

143

String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,

144

boolean relaxedIdMatching) throws IOException

145

{

146

String strand = gffColumns[STRAND_COL];

147

148

149

* (For now) we don't process mappings from reverse complement ; to do

150

* this would require (a) creating a virtual sequence placeholder for

151

* the reverse complement (b) resolving the sequence by its id from some

152

* source (GFF ##FASTA or other) (c) creating the reverse complement

153

* sequence (d) updating the mapping to be to the reverse complement

154

155

if ("-".equals(strand))

156

{

157

jalview.bin.Console.errPrintln(

158

"Skipping mapping from reverse complement as not yet supported");

return null;

}

List<String> targets = attributes.get(TARGET);

163

if (targets == null)

164

{

165

jalview.bin.Console.errPrintln("'Target' missing in GFF");

return null;

}

* Typically we only expect one Target per GFF line, but this can handle

171

* multiple matches, to the same or different sequences (e.g. dna variants)

172

173

for (String target : targets)

174

{

175

176

* Process "seqid start end [strand]"

177

178

String[] tokens = target.split(" ");

179

if (tokens.length < 3)

180

{

181

jalview.bin.Console.errPrintln("Incomplete Target: " + target);

continue;

}

* Locate the mapped sequence in the alignment, or as a

187

* (new or existing) virtual sequence in the newseqs list

188

189

String targetId = findTargetId(tokens[0], attributes);

190

SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,

191

relaxedIdMatching);

192

SequenceI mappedSequence = mappedSequence1;

193

if (mappedSequence == null)

{

continue;

}

* get any existing mapping for these sequences (or start one),

200

* and add this mapped range

201

202

AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);

try

{

int toStart = Integer.parseInt(tokens[1]);

207

int toEnd = Integer.parseInt(tokens[2]);

208

if (tokens.length > 3 && "-".equals(tokens[3]))

209

{

210

// mapping to reverse strand - swap start/end

int temp = toStart;

toStart = toEnd;

toEnd = temp;

}

int fromStart = Integer.parseInt(gffColumns[START_COL]);

217

int fromEnd = Integer.parseInt(gffColumns[END_COL]);

218

MapList mapping = constructMappingFromAlign(fromStart, fromEnd,

219

toStart, toEnd, MappingType.NucleotideToNucleotide);

if (mapping != null)

{

acf.addMap(seq, mappedSequence, mapping);

224

align.addCodonFrame(acf);

225

}

226

} catch (NumberFormatException nfe)

227

{

228

jalview.bin.Console

229

.errPrintln("Invalid start or end in Target " + target);

}

}

SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);

return sf;

}

/**

* Returns the target sequence id extracted from the GFF name/value pairs.

239

* Default (standard behaviour) is the first token for "Target". This may be

240

* overridden where tools report this in a non-standard way.

241

242

* @param target

243

* first token of a "Target" value from GFF column 9, typically

244

* "seqid start end"

245

* @param set

246

* a map with all parsed column 9 attributes

247

* @return

248

249

@SuppressWarnings("unused")

250

protected String findTargetId(String target,

251

Map<String, List<String>> set)

{

return target;

}

/**

* Processes one GFF 'protein_match'; fields of interest are

258

* <ul>

259

* <li>feature group - the database reporting a match e.g. Pfam</li>

260

* <li>Name - the matched entry's accession id in the database</li>

261

* <li>ID - a sequence identifier for the matched region (which may be

262

* appended as FASTA in the GFF file)</li>

* </ul>

* @param set

* parsed GFF column 9 key/value(s)

267

* @param seq

268

* the sequence the GFF feature is on

269

* @param gffColumns

270

* the sequence feature holding GFF data

271

* @param align

272

* the alignment the sequence belongs to, where any new mappings

273

* should be added

274

* @param newseqs

275

* a list of new 'virtual sequences' generated while parsing GFF

276

* @param relaxedIdMatching

277

* if true allow fuzzy search for a matching target sequence

278

* @return the (real or virtual) sequence(s) mapped to by this match

279

* @throws IOException

280

281

protected SequenceFeature processProteinMatch(

282

Map<String, List<String>> set, SequenceI seq, String[] gffColumns,

283

AlignmentI align, List<SequenceI> newseqs,

284

boolean relaxedIdMatching)

285

{

286

// This is currently tailored to InterProScan GFF output:

287

// ID holds the ID of the matched sequence, Target references the

288

// query sequence; this looks wrong, as ID should just be the GFF internal

289

// ID of the GFF feature, while Target would normally reference the matched

290

// sequence.

291

// TODO refactor as needed if other protein-protein GFF varies

292

293

SequenceFeature sf = buildSequenceFeature(gffColumns, set);

294

295

296

* locate the mapped sequence in the alignment, or as a

297

* (new or existing) virtual sequence in the newseqs list

298

299

List<String> targets = set.get(TARGET);

300

if (targets != null)

301

{

302

for (String target : targets)

303

{

304

305

SequenceI mappedSequence1 = findSequence(findTargetId(target, set),

306

align, newseqs, relaxedIdMatching);

307

SequenceI mappedSequence = mappedSequence1;

308

if (mappedSequence == null)

{

continue;

}

* give the mapped sequence a copy of the sequence feature, with

315

* start/end range adjusted

316

317

int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();

318

SequenceFeature sf2 = new SequenceFeature(sf, 1,

319

sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());

320

mappedSequence.addSequenceFeature(sf2);

321

322

323

* add a property to the mapped sequence so that it can eventually be

324

* renamed with its qualified accession id; renaming has to wait until

325

* all sequence reference resolution is complete

326

327

String accessionId = StringUtils

328

.listToDelimitedString(set.get(NAME), ",");

329

if (accessionId.length() > 0)

330

{

331

String database = sf.getType(); // TODO InterProScan only??

332

String qualifiedAccId = database + "|" + accessionId;

333

sf2.setValue(RENAME_TOKEN, qualifiedAccId);

}

* get any existing mapping for these sequences (or start one),

338

* and add this mapped range

339

340

AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);

341

int[] from = new int[] { sf.getBegin(), sf.getEnd() };

342

int[] to = new int[] { 1, sequenceFeatureLength };

343

MapList mapping = new MapList(from, to, 1, 1);

344

345

alco.addMap(seq, mappedSequence, mapping);

346

align.addCodonFrame(alco);

}

}

return sf;

}

/**

* Modifies the default SequenceFeature in order to set the Target sequence id

* as the description

@Override

protected SequenceFeature buildSequenceFeature(String[] gff,

359

int typeColumn, String group,

360

Map<String, List<String>> attributes)

361

{

362

SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,

363

attributes);

364

String desc = getDescription(sf, attributes);

365

if (desc != null)

366

{

367

sf.setDescription(desc);

}

return sf;

}

/**

* Apply heuristic rules to try to get the most useful feature description

* @param sf

* @param attributes

* @return

protected String getDescription(SequenceFeature sf,

380

Map<String, List<String>> attributes)

381

{

382

String desc = null;

383

String target = (String) sf.getValue(TARGET);

384

if (target != null)

385

{

386

desc = target.split(" ")[0];

387

}

388

389

SequenceOntologyI so = SequenceOntologyFactory.getInstance();

390

String type = sf.getType();

391

if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))

392

{

393

394

* Ensembl returns dna variants as 'alleles'

395

396

desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),

",");

}

* extract 'Name' for a transcript (to show gene name)

402

* or an exon (so 'colour by label' shows exon boundaries)

403

404

if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)

405

|| so.isA(type, SequenceOntologyI.TRANSCRIPT)

406

|| so.isA(type, SequenceOntologyI.EXON))

407

{

408

desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");

}

* if the above fails, try ID

if (desc == null)

{

desc = (String) sf.getValue(ID);

}

* and decode comma, equals, semi-colon as required by GFF3 spec

421

422

desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);

return desc;

}

}

Coverage Report

File Gff3Helper.java

Coverage histogram

Code metrics

Classes

Class Gff3Helper

Contributing tests

Contributing tests

Source view