File GffHelperBase.java

Branches:

Statements:

135

Methods:

Classes:

LOC:

574

NCLOC:

312

Total complexity:

Complexity density:

0.39

Statements/Method:

16.88

Methods/Class:

Average method complexity:

6.62

Classes

Class	Line #	Total Statements	Complexity	TOTAL Coverage	Actions
GffHelperBase	44	135	53	0.00%

Class GffHelperBase

Class GffHelperBase	Line # 44	Total Statements 135	Complexity 53	TOTAL Coverage 0.00%
constructMappingFromAlign(int,int,int,int,MappingType) : MapList constructMappingFromAlign(int,int,int,int,MappingType) : MapList	9090	13.013	3.03	0.0 0.00%
trimMapping(int[],int[],int,int) : boolean trimMapping(int[],int[],int,int) : boolean	144144	21.021	8.08	0.0 0.00%
findSequence(String,AlignmentI,List<SequenceI>,boolean) : SequenceI findSequence(String,AlignmentI,List<SequenceI>,boolean) : SequenceI	219219	21.021	11.011	0.0 0.00%
parseNameValuePairs(String,String,char,String) : Map<String, List<String>> parseNameValuePairs(String,String,char,String) : Map<String, List<String>>	297297	23.023	8.08	0.0 0.00%
buildSequenceFeature(String[],Map<String, List<String>>) : SequenceFeature buildSequenceFeature(String[],Map<String, List<String>>) : SequenceFeature	371371	1.01	1.01	0.0 0.00%
buildSequenceFeature(String[],int,String,Map<String, List<String>>) : SequenceFeature buildSequenceFeature(String[],int,String,Map<String, List<String>>) : SequenceFeature	384384	24.024	7.07	0.0 0.00%
parseAttributeMap(String) : Map<String, String> parseAttributeMap(String) : Map<String, String>	470470	28.028	13.013	0.0 0.00%
getMapping(AlignmentI,SequenceI,SequenceI) : AlignedCodonFrame getMapping(AlignmentI,SequenceI,SequenceI) : AlignedCodonFrame	563563	4.04	2.02	0.0 0.00%

Contributing tests

No tests hitting this source file were found.

Source view

/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */

package jalview.io.gff;

import jalview.analysis.SequenceIdMatcher;

import jalview.datamodel.AlignedCodonFrame;

import jalview.datamodel.AlignmentI;

import jalview.datamodel.MappingType;

import jalview.datamodel.SequenceDummy;

import jalview.datamodel.SequenceFeature;

import jalview.datamodel.SequenceI;

import jalview.util.MapList;

import jalview.util.StringUtils;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.Map.Entry;

/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */

public abstract class GffHelperBase implements GffHelperI

{

private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";

protected static final String COMMA = ",";

protected static final String EQUALS = "=";

protected static final String NOTE = "Note";

/*
 * GFF columns 1-9 (zero-indexed):
 */

protected static final int SEQID_COL = 0;

protected static final int SOURCE_COL = 1;

protected static final int TYPE_COL = 2;

protected static final int START_COL = 3;

protected static final int END_COL = 4;

protected static final int SCORE_COL = 5;

protected static final int STRAND_COL = 6;

protected static final int PHASE_COL = 7;

protected static final int ATTRIBUTES_COL = 8;

private AlignmentI lastmatchedAl = null;

private SequenceIdMatcher matcher = null;

/**

* Constructs and returns a mapping, or null if data appear invalid

* @param fromStart

* @param fromEnd

* @param toStart

* @param toEnd

* @param mappingType

* type of mapping (e.g. protein to nucleotide)

* @return

protected MapList constructMappingFromAlign(int fromStart, int fromEnd,

int toStart, int toEnd, MappingType mappingType)

{

int[] from = new int[] { fromStart, fromEnd };

int[] to = new int[] { toStart, toEnd };

* Jalview always models from dna to protein, so switch values if the

* GFF mapping is from protein to dna

100

if (mappingType == MappingType.PeptideToNucleotide)

{

int[] temp = from;

from = to;

to = temp;

mappingType = mappingType.getInverse();

106

}

107

108

int fromRatio = mappingType.getFromRatio();

109

int toRatio = mappingType.getToRatio();

110

111

112

* sanity check that mapped residue counts match

113

* TODO understand why PASA generates such cases...

114

115

if (!trimMapping(from, to, fromRatio, toRatio))

116

{

117

jalview.bin.Console.errPrintln(

118

"Ignoring mapping from " + Arrays.toString(from) + " to "

119

+ Arrays.toString(to) + " as counts don't match!");

return null;

}

* If a codon has an intron gap, there will be contiguous 'toRanges';

125

* this is handled for us by the MapList constructor.

126

* (It is not clear that exonerate ever generates this case)

127

128

129

return new MapList(from, to, fromRatio, toRatio);

}

/**

* Checks that the 'from' and 'to' ranges have equivalent lengths. If not,

134

* tries to trim the end of the longer so they do. Returns true if the

135

* mappings could be made equivalent, else false. Note the range array values

136

* may be modified by this method.

* @param from

* @param to

* @param fromRatio

* @param toRatio

* @return

protected static boolean trimMapping(int[] from, int[] to, int fromRatio,

145

int toRatio)

146

{

147

int fromLength = Math.abs(from[1] - from[0]) + 1;

148

int toLength = Math.abs(to[1] - to[0]) + 1;

149

int fromOverlap = fromLength * toRatio - toLength * fromRatio;

150

if (fromOverlap == 0)

{

return true;

}

if (fromOverlap > 0 && fromOverlap % toRatio == 0)

155

{

156

157

* restrict from range to make them match up

158

* it's kind of arbitrary which end we truncate - here it is the end

159

160

System.err.print(

161

"Truncating mapping from " + Arrays.toString(from) + " to ");

162

if (from[1] > from[0])

163

{

164

from[1] -= fromOverlap / toRatio;

}

else

{

from[1] += fromOverlap / toRatio;

169

}

170

jalview.bin.Console.errPrintln(Arrays.toString(from));

171

return true;

172

}

173

else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)

174

{

175

fromOverlap = -fromOverlap; // > 0

176

177

* restrict to range to make them match up

178

179

System.err.print(

180

"Truncating mapping to " + Arrays.toString(to) + " to ");

181

if (to[1] > to[0])

182

{

183

to[1] -= fromOverlap / fromRatio;

}

else

{

to[1] += fromOverlap / fromRatio;

188

}

189

jalview.bin.Console.errPrintln(Arrays.toString(to));

return true;

}

* Couldn't truncate to an exact match..

return false;

}

/**

* Returns a sequence matching the given id, as follows

201

* <ul>

202

* <li>strict matching is on exact sequence name</li>

203

* <li>relaxed matching allows matching on a token within the sequence name,

204

* or a dbxref</li>

205

* <li>first tries to find a match in the alignment sequences</li>

206

* <li>else tries to find a match in the new sequences already generated while

207

* parsing the features file</li>

208

* <li>else creates a new placeholder sequence, adds it to the new sequences

209

* list, and returns it</li>

* </ul>

* @param seqId

* @param align

* @param newseqs

* @param relaxedIdMatching

* @return

protected SequenceI findSequence(String seqId, AlignmentI align,

220

List<SequenceI> newseqs, boolean relaxedIdMatching)

{

if (seqId == null)

{

return null;

}

SequenceI match = null;

227

if (relaxedIdMatching)

228

{

229

if (lastmatchedAl != align)

230

{

231

lastmatchedAl = align;

232

matcher = new SequenceIdMatcher(align.getSequencesArray());

233

if (newseqs != null)

234

{

235

matcher.addAll(newseqs);

236

}

237

}

238

match = matcher.findIdMatch(seqId);

}

else

{

match = align.findName(seqId, true);

243

if (match == null && newseqs != null)

244

{

245

for (SequenceI m : newseqs)

246

{

247

if (seqId.equals(m.getName()))

{

return m;

}

}

}

}

if (match == null && newseqs != null)

256

{

257

match = new SequenceDummy(seqId);

258

if (relaxedIdMatching)

259

{

260

matcher.addAll(Arrays.asList(new SequenceI[] { match }));

261

}

262

// add dummy sequence to the newseqs list

newseqs.add(match);

}

return match;

}

/**

* Parses the input line to a map of name / value(s) pairs. For example the

* line

* <pre>

* Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal

274

* </pre>

275

276

* if parsed with delimiter=";" and separators {' ', '='} <br>

277

* would return a map with { Notes={Fe=S, Metal}, Method={manual curation,

278

* prediction}, source={Pfam}} <br>

279

280

* This method supports parsing of either GFF2 format (which uses space ' ' as

281

* the name/value delimiter, and allows multiple occurrences of the same

282

* name), or GFF3 format (which uses '=' as the name/value delimiter, and

283

* strictly does not allow repeat occurrences of the same name - but does

284

* allow a comma-separated list of values).

285

* <p>

286

* Returns a (possibly empty) map of lists of values by attribute name.

287

288

* @param text

289

* @param namesDelimiter

290

* the major delimiter between name-value pairs

291

* @param nameValueSeparator

292

* separator used between name and value

293

* @param valuesDelimiter

294

* delimits a list of more than one value

295

* @return

296

297

public static Map<String, List<String>> parseNameValuePairs(String text,

298

String namesDelimiter, char nameValueSeparator,

299

String valuesDelimiter)

300

{

301

Map<String, List<String>> map = new HashMap<>();

302

if (text == null || text.trim().length() == 0)

{

return map;

}

* split by major delimiter (; for GFF3)

309

310

for (String nameValuePair : text.trim().split(namesDelimiter))

311

{

312

nameValuePair = nameValuePair.trim();

313

if (nameValuePair.length() == 0)

{

continue;

}

* find name/value separator (= for GFF3)

320

321

int sepPos = nameValuePair.indexOf(nameValueSeparator);

322

if (sepPos == -1)

323

{

324

// no name=value found

continue;

}

String name = nameValuePair.substring(0, sepPos).trim();

329

String values = nameValuePair.substring(sepPos + 1).trim();

330

if (values.isEmpty())

{

continue;

}

List<String> vals = map.get(name);

336

if (vals == null)

337

{

338

vals = new ArrayList<>();

map.put(name, vals);

}

* if 'values' contains more name/value separators, parse as a map

344

* (nested sub-attribute values)

345

346

if (values.indexOf(nameValueSeparator) != -1)

{

vals.add(values);

}

else

{

for (String val : values.split(valuesDelimiter))

{

vals.add(val);

}

}

}

return map;

}

/**

* Constructs a SequenceFeature from the GFF column data. Subclasses may wish

364

* to call this method then adjust the SequenceFeature depending on the

365

* particular usage of different tools that generate GFF.

* @param gff

* @param attributes

* @return

protected SequenceFeature buildSequenceFeature(String[] gff,

372

Map<String, List<String>> attributes)

373

{

374

return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);

}

/**

* @param gff

* @param typeColumn

* @param group

* @param attributes

* @return

protected SequenceFeature buildSequenceFeature(String[] gff,

385

int typeColumn, String group,

386

Map<String, List<String>> attributes)

{

try

{

int start = Integer.parseInt(gff[START_COL]);

391

int end = Integer.parseInt(gff[END_COL]);

392

393

394

* default 'score' is 0 rather than Float.NaN - see JAL-2554

float score = 0f;

try

{

score = Float.parseFloat(gff[SCORE_COL]);

400

} catch (NumberFormatException nfe)

401

{

402

// e.g. '.' - leave as zero

403

}

404

405

SequenceFeature sf = new SequenceFeature(gff[typeColumn],

406

gff[SOURCE_COL], start, end, score, group);

407

408

sf.setStrand(gff[STRAND_COL]);

409

410

sf.setPhase(gff[PHASE_COL]);

411

412

if (attributes != null)

413

{

414

415

* Add attributes in column 9 to the sequence feature's

416

* 'otherData' table; use Note as a best proxy for description;

417

* decode any encoded comma, equals, semi-colon as per GFF3 spec

418

419

for (Entry<String, List<String>> attr : attributes.entrySet())

420

{

421

String key = attr.getKey();

422

List<String> values = attr.getValue();

423

if (values.size() == 1 && values.get(0).contains(EQUALS))

424

{

425

426

* 'value' is actually nested subattributes as x=a,y=b,z=c

427

428

Map<String, String> valueMap = parseAttributeMap(values.get(0));

429

sf.setValue(key, valueMap);

}

else

{

String csvValues = StringUtils.listToDelimitedString(values,

434

COMMA);

435

csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);

436

sf.setValue(key, csvValues);

437

if (NOTE.equals(key))

438

{

439

sf.setDescription(csvValues);

}

}

}

}

return sf;

} catch (NumberFormatException nfe)

447

{

448

jalview.bin.Console

449

.errPrintln("Invalid number in gff: " + nfe.getMessage());

return null;

}

}

/**

* Parses a (GFF3 format) list of comma-separated key=value pairs into a Map

456

* of {@code key,

457

* value} <br>

458

* An input string like {@code a=b,c,d=e,f=g,h} is parsed to

* <pre>

* a = "b,c"

* d = "e"

* f = "g,h"

* </pre>

* @param s

* @return

protected static Map<String, String> parseAttributeMap(String s)

471

{

472

Map<String, String> map = new HashMap<>();

473

String[] fields = s.split(EQUALS);

* format validation

boolean valid = true;

479

if (fields.length < 2)

480

{

481

482

* need at least A=B here

valid = false;

}

else if (fields[0].isEmpty() || fields[0].contains(COMMA))

487

{

488

489

* A,B=C is not a valid start, nor is =C

valid = false;

}

else

{

for (int i = 1; i < fields.length - 1; i++)

496

{

497

if (fields[i].isEmpty() || !fields[i].contains(COMMA))

498

{

499

500

* intermediate tokens must include value,name

valid = false;

}

}

}

if (!valid)

{

jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);

return map;

}

int i = 0;

while (i < fields.length - 1)

515

{

516

boolean lastPair = i == fields.length - 2;

517

String before = fields[i];

518

String after = fields[i + 1];

519

520

521

* if 'key' looks like a,b,c then the last token is the

522

* key

523

524

String theKey = before.contains(COMMA)

525

? before.substring(before.lastIndexOf(COMMA) + 1)

526

: before;

527

528

theKey = theKey.trim();

529

if (theKey.isEmpty())

530

{

531

jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);

map.clear();

return map;

}

* if 'value' looks like a,b,c then all but the last token is the value,

538

* unless this is the last field (no more = to follow), in which case

539

* all of it makes up the value

540

541

String theValue = after.contains(COMMA) && !lastPair

542

? after.substring(0, after.lastIndexOf(COMMA))

543

: after;

544

map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),

545

StringUtils.urlDecode(theValue, GFF_ENCODABLE));

i += 1;

}

return map;

}

/**

* Returns any existing mapping held on the alignment between the given

554

* dataset sequences, or a new one if none found. This is a convenience method

555

* to facilitate processing multiple GFF lines that make up a single 'spliced'

556

* mapping, by extending the first mapping as the others are read.

* @param align

* @param fromSeq

* @param toSeq

* @return

protected AlignedCodonFrame getMapping(AlignmentI align,

564

SequenceI fromSeq, SequenceI toSeq)

565

{

566

AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);

567

if (acf == null)

568

{

569

acf = new AlignedCodonFrame();

}

return acf;

}

}

Coverage Report

File GffHelperBase.java

Coverage histogram

Code metrics

Classes

Class GffHelperBase

Contributing tests

Source view