diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d3716aa6..017c6814 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,13 +1,21 @@ name: main -on: [push] +on: + push: + branches: + - master + pull_request: + types: + - opened + - reopened + - synchronize jobs: build-and-test: strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - name: git setup id: git-setup run: | @@ -18,23 +26,22 @@ jobs: - name: conda env run: | - wget -O Mambaforge.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" - curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" - bash Mambaforge.sh -b -p "${HOME}/conda" + wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3.sh -b -p "${HOME}/conda" source "${HOME}/conda/etc/profile.d/conda.sh" - source "${HOME}/conda/etc/profile.d/mamba.sh" which conda conda config --system --add channels defaults conda config --system --add channels bioconda conda config --system --add channels conda-forge conda config --system --set channel_priority strict - mamba create -y -n gffutils-env \ + conda create -y -n gffutils-env \ python=${{ matrix.python-version }} \ bedtools conda activate gffutils-env - python setup.py clean sdist - (cd dist && pip install gffutils-*.tar.gz) + python -m pip install build + python -m build + (cd dist && python -m pip install gffutils-*.tar.gz) cd $TMPDIR python -c "import gffutils; print(gffutils.__file__)" conda deactivate @@ -42,11 +49,10 @@ jobs: - name: run unit tests run: | source "${HOME}/conda/etc/profile.d/conda.sh" - source "${HOME}/conda/etc/profile.d/mamba.sh" - conda activate gffutils-env - pip install pytest 
hypothesis biopython pybedtools - pytest -v --doctest-modules gffutils + conda install -y bedtools + python -m pip install -e '.[optional,test]' + pytest conda deactivate - name: doctests @@ -61,9 +67,8 @@ jobs: if: ${{ (matrix.python-version != 3.8) }} run: | source "${HOME}/conda/etc/profile.d/conda.sh" - source "${HOME}/conda/etc/profile.d/mamba.sh" - mamba install -y -n gffutils-env --file docs-requirements.txt conda activate gffutils-env + python -m pip install -e '.[docs]' (cd doc && make clean doctest) conda deactivate @@ -72,7 +77,6 @@ jobs: if: ${{ (matrix.python-version != 3.8) }} run: | source "${HOME}/conda/etc/profile.d/conda.sh" - source "${HOME}/conda/etc/profile.d/mamba.sh" conda activate gffutils-env (cd doc && make html) conda deactivate @@ -83,7 +87,6 @@ jobs: --branch gh-pages "https://x-acess-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY" \ /tmp/docs - # clean it out and add newly-built docs rm -rf /tmp/docs/* cp -r doc/build/html/* /tmp/docs @@ -102,7 +105,7 @@ jobs: - name: push artifact if: ${{ (matrix.python-version == 3.9) }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v6 with: name: doc path: /tmp/docs @@ -110,7 +113,7 @@ jobs: - name: push docs to gh-pages branch # Push docs to gh-pages if this test is running on master branch, and # restrict to a single Python version. 
- if: ${{ (github.ref == 'refs/heads/master') && (matrix.python-version == 3.9) }} + if: ${{ (github.ref == 'refs/heads/master') && (matrix.python-version == 3.12) }} run: | cd /tmp/docs git push "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY" gh-pages diff --git a/.gitignore b/.gitignore index 29a95722..335fa300 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +env/ *.swo *gfffeature.so *.swp diff --git a/MANIFEST.in b/MANIFEST.in index 06f76dad..ae40da46 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,12 +1,6 @@ include README.rst include requirements.txt include LICENSE -recursive-include docs/source *.rst -recursive-include docs/source *.py -recursive-include docs/source/images * -recursive-include doc/source/_templates * -include docs/Makefile -include docs/make.bat include gffutils/test/data/c_elegans_WS199_ann_gff.txt include gffutils/test/data/c_elegans_WS199_dna_shortened.fa include gffutils/test/data/c_elegans_WS199_shortened_gff.txt diff --git a/doc/source/api.rst b/doc/source/api.rst index 2f9adefe..309a689a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -117,10 +117,10 @@ Integration with other tools :toctree: autodocs :nosignatures: - gffutils.biopython_integration.to_seqfeature - gffutils.biopython_integration.from_seqfeature - gffutils.pybedtools_integration.tsses - gffutils.pybedtools_integration.to_bedtool + biopython_integration.to_seqfeature + biopython_integration.from_seqfeature + pybedtools_integration.tsses + pybedtools_integration.to_bedtool @@ -131,10 +131,10 @@ Utilities :toctree: autodocs :nosignatures: - gffutils.helpers.asinterval - gffutils.helpers.merge_attributes - gffutils.helpers.sanitize_gff_db - gffutils.helpers.annotate_gff_db - gffutils.helpers.infer_dialect - gffutils.helpers.example_filename - gffutils.inspect.inspect + helpers.asinterval + helpers.merge_attributes + helpers.sanitize_gff_db + helpers.annotate_gff_db + helpers.infer_dialect + helpers.example_filename + 
inspect.inspect diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index f1f7545b..5c7b503f 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -3,6 +3,52 @@ Change log ========== + +v0.14 +----- + +- If a value contained a semicolon there would be unexpected behavior (reported + in `#212 `__). This is solved + by adding a new entry to the dialect, ``semicolon in quotes```, and running + the necessary regular expression only when inferring dialect, or, if + ``semicolon in quotes`` is ``True``, on every feature. In the latter case, + this can dramatically increase the parsing time, since in Python regular + expressions are relatively slow, but it does correctly parse. Thanks to + @DevangThakkar for the fix. +- While working on that, refactored the attributes parsing to make it clearer + to follow along, and added more tests. The refactoring fixed some subtle bugs + on corner cases: + - Previously, for features with repeated keys, the ``order`` key of dialects + would list the repeated keys each time they appeared (i.e., the list had + duplicates) which could result in undetermined behavior. The ``order`` key + is now unique and only the first occurrence of a repeated key will be added + to the order. + - Previously, the ``ensembl_gtf.txt`` example file had a leading *space* in + front of the attributes. This looks to be an error in the creation of the + example file in the first place, but had previously parsed fine. Now the + parser (correctly) mis-handles it. Since I'm unaware of any cases in the + wild that have a leading space, I actually consider the new parsing, which + complains about the space, to be more correct. + - Added tests to directly inspect the inferred dialects for the test cases. +- Preserve GFF directives when ``create_db()`` imports from a file path, + matching the behavior for string-backed iterators and fixing + `#213 `__. This was due to + a different path through the code when using a `pathlib.Path` object. 
In + addition to this fix, `pathlib.Path` objects are now converted to `str` + throughout the code base with ``os.fspath`` where appropriate. +- CI, testing, and docs infrastructure updates (miniforge instead of + mambaforge; GitHub Action version bumps; skip biopython test if it's not + installed (`#233 `__); reduce build errors for docs) +- Fix `#224 `__, which was caused + by changes to the ``argh`` package used for the command-line tool. +- Address `#242 `__ (typo in docstring) +- Migrate to using ``pyproject.toml`` for packaging. This changes how versions are calculated + and reported, and removes the need for ``setup.py``. Version is only ever + recorded in ``pyproject.toml``; ``version.py`` gets the installed version or + parses the TOML if not installed; ``setup.py`` just calls ``setup()`` with no + arguments since everything has been migrated to ``pyproject.toml``. + + v0.13 ----- diff --git a/doc/source/conf.py b/doc/source/conf.py index c65c4a28..2b85647c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,5 +53,3 @@ templates_path = ['_templates'] exclude_patterns = [] html_theme = 'sphinx_rtd_theme' -html_static_path = ['_static'] -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/doc/source/dialect.rst b/doc/source/dialect.rst index b02d7c6a..aed018b9 100644 --- a/doc/source/dialect.rst +++ b/doc/source/dialect.rst @@ -38,7 +38,8 @@ A GTF dialect might look like this:: 'multival separator': ',', 'quoted GFF2 values': True, 'repeated keys': False, - 'trailing semicolon': True} + 'trailing semicolon': True, + 'semicolon_in_quotes': False} In contrast, a GFF dialect might look like this:: @@ -49,7 +50,9 @@ In contrast, a GFF dialect might look like this:: 'multival separator': ',', 'quoted GFF2 values': False, 'repeated keys': False, - 'trailing semicolon': False} + 'trailing semicolon': False, + 'semicolon_in_quotes': False} + As other real-world files are brought to the attention of the developers, it's likely that 
more entries will be added to the dialect. diff --git a/doc/source/examples.rst b/doc/source/examples.rst index 6b631236..54d8e45a 100644 --- a/doc/source/examples.rst +++ b/doc/source/examples.rst @@ -235,7 +235,7 @@ data upon import into the database: ... return x -Now we can supply this tranform function to :func:`create_db`: +Now we can supply this transform function to :func:`create_db`: >>> fn = gffutils.example_filename('ensembl_gtf.txt') >>> db = gffutils.create_db(fn, ":memory:", @@ -643,8 +643,8 @@ attributes to have the same format. To help with this, we can use the >>> dialect = helpers.infer_dialect( ... 'Transcript "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138" ; CDS "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138"', ... ) ->>> print(dialect) -{'leading semicolon': False, 'trailing semicolon': False, 'quoted GFF2 values': True, 'field separator': ' ; ', 'keyval separator': ' ', 'multival separator': ',', 'fmt': 'gtf', 'repeated keys': True, 'order': ['Transcript', 'WormPep', 'Note', 'Prediction_status', 'Gene', 'CDS', 'WormPep', 'Note', 'Prediction_status', 'Gene']} +>>> print({k: v for k, v in sorted(dialect.items())}) +{'field separator': ' ; ', 'fmt': 'gtf', 'keyval separator': ' ', 'leading semicolon': False, 'multival separator': ',', 'order': ['Transcript', 'WormPep', 'Note', 'Prediction_status', 'Gene', 'CDS'], 'quoted GFF2 values': True, 'repeated keys': True, 'semicolon in quotes': False, 'trailing semicolon': False} >>> db.dialect = dialect diff --git a/doc/source/index.rst b/doc/source/index.rst index 24f0d48c..8bf7cb76 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -6,9 +6,9 @@ Introduction ============ :mod:`gffutils` is a Python package for working with `GFF -`_ and `GTF -`_ files in a hierarchical manner. 
It -allows operations which would be complicated or time-consuming using +`__ +and `GTF `_ files in a hierarchical manner. +It allows operations which would be complicated or time-consuming using a text-file-only approach. Below is a short demonstration of :mod:`gffutils`. For the full documentation, diff --git a/gffutils/__init__.py b/gffutils/__init__.py index 0cf7853f..0242cb5f 100644 --- a/gffutils/__init__.py +++ b/gffutils/__init__.py @@ -5,3 +5,14 @@ from gffutils.helpers import example_filename from gffutils.exceptions import FeatureNotFoundError, DuplicateIDError from gffutils.version import version as __version__ + +__all__ = [ + "__version__", + "create_db", + "FeatureDB", + "Feature", + "DataIterator", + "example_filename", + "FeatureNotFoundError", + "DuplicateIDError", +] diff --git a/gffutils/constants.py b/gffutils/constants.py index 901e7146..2543e64d 100644 --- a/gffutils/constants.py +++ b/gffutils/constants.py @@ -127,6 +127,12 @@ # vs # ID=001; Name=gene1 "field separator": ";", + # Sometimes there are semicolons inside quotes that break things, e.g., + # + # note "Evidence 1a: Function1, Function2" + # vs + # note "Evidence 1a: Function; PubMedId: 123, 456" + "semicolon in quotes": False, # Usually "=" for GFF3; " " for GTF, e.g., # # gene_id "GENE1" diff --git a/gffutils/contrib/plotting.py b/gffutils/contrib/plotting.py index 55e3c333..bfaa9032 100644 --- a/gffutils/contrib/plotting.py +++ b/gffutils/contrib/plotting.py @@ -1,11 +1,11 @@ +import warnings + from gffutils.helpers import asinterval try: from pybedtools.contrib.plotting import Track except ImportError: - import warnings - - warnings.warn("Please install pybedtools for plotting.") + Track = None class Gene(object): @@ -49,6 +49,9 @@ def __init__( UTRs, CDSs are. Padding is essentially "full" minus the largest height (CDS, 0.9, by default). 
""" + if Track is None: + warnings.warn("Please install pybedtools for plotting.") + raise ImportError("pybedtools is required for gffutils.contrib.plotting") self.heights = {"transcript": 0.2, "utrs": 0.5, "cds": 0.9, "full": 1.0} self.kwargs = kwargs diff --git a/gffutils/create.py b/gffutils/create.py index e137c1af..6e3fbd24 100644 --- a/gffutils/create.py +++ b/gffutils/create.py @@ -76,6 +76,9 @@ def __init__( Base class for _GFFDBCreator and _GTFDBCreator; see create_db() function for docs """ + if isinstance(dbfn, os.PathLike): + dbfn = os.fspath(dbfn) + self._keep_tempfiles = _keep_tempfiles if force_merge_fields is None: force_merge_fields = [] diff --git a/gffutils/feature.py b/gffutils/feature.py index 6db720ff..e833774c 100644 --- a/gffutils/feature.py +++ b/gffutils/feature.py @@ -1,4 +1,5 @@ from pyfaidx import Fasta +import os import simplejson as json from gffutils import constants from gffutils import helpers @@ -383,6 +384,8 @@ def sequence(self, fasta, use_strand=True): ------- string """ + if isinstance(fasta, os.PathLike): + fasta = os.fspath(fasta) if isinstance(fasta, str): fasta = Fasta(fasta, as_raw=False) diff --git a/gffutils/gffwriter.py b/gffutils/gffwriter.py index c3026927..895cf372 100644 --- a/gffutils/gffwriter.py +++ b/gffutils/gffwriter.py @@ -3,6 +3,7 @@ ## import tempfile import shutil +import os from time import strftime, localtime from gffutils.version import version @@ -33,6 +34,8 @@ class GFFWriter: """ def __init__(self, out, with_header=True, in_place=False): + if isinstance(out, os.PathLike): + out = os.fspath(out) self.out = out self.with_header = with_header self.in_place = in_place diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 234994db..7418d29a 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -498,6 +498,8 @@ def is_gff_db(db_fname): For now, rely on .db extension. 
""" + if isinstance(db_fname, os.PathLike): + db_fname = os.fspath(db_fname) if not os.path.isfile(db_fname): return False if db_fname.endswith(".db"): @@ -519,6 +521,8 @@ def canonical_transcripts(db, fasta_filename): """ import pyfaidx + if isinstance(fasta_filename, os.PathLike): + fasta_filename = os.fspath(fasta_filename) fasta = pyfaidx.Fasta(fasta_filename, as_raw=False) for gene in db.features_of_type("gene"): @@ -580,6 +584,8 @@ def get_gff_db(gff_fname, ext=".db"): load that. Otherwise, create a named temporary file, serialize the db to that, and return the loaded database. """ + if isinstance(gff_fname, os.PathLike): + gff_fname = os.fspath(gff_fname) if not os.path.isfile(gff_fname): # Not sure how we should deal with errors normally in # gffutils -- Ryan? diff --git a/gffutils/interface.py b/gffutils/interface.py index 9216cd39..74e37069 100644 --- a/gffutils/interface.py +++ b/gffutils/interface.py @@ -161,6 +161,9 @@ def __init__( # db. from gffutils import create + if isinstance(dbfn, os.PathLike): + dbfn = os.fspath(dbfn) + if isinstance(dbfn, create._DBCreator): self.conn = dbfn.conn self.dbfn = dbfn.dbfn @@ -454,27 +457,7 @@ def _relation( completely_within=False, limit=None, ): - - # The following docstring will be included in the parents() and - # children() docstrings to maintain consistency, since they both - # delegate to this method. - """ - Parameters - ---------- - - id : string or a Feature object - - level : None or int - - If `level=None` (default), then return all children regardless - of level. If `level` is an integer, then constrain to just that - level. - {_method_doc} - - Returns - ------- - A generator object that yields :class:`Feature` objects. - """ + """Internal implementation for parent/child relationship queries.""" if isinstance(id, Feature): id = id.id @@ -521,7 +504,22 @@ def children( ): """ Return children of feature `id`. 
- {_relation_docstring} + + Parameters + ---------- + + id : string or a Feature object + + level : None or int + + If `level=None` (default), then return all related children + regardless of level. If `level` is an integer, then constrain to + just that level. + {_method_doc} + + Returns + ------- + A generator object that yields :class:`Feature` objects. """ return self._relation( id, @@ -547,7 +545,22 @@ def parents( ): """ Return parents of feature `id`. - {_relation_docstring} + + Parameters + ---------- + + id : string or a Feature object + + level : None or int + + If `level=None` (default), then return all related parents + regardless of level. If `level` is an integer, then constrain to + just that level. + {_method_doc} + + Returns + ------- + A generator object that yields :class:`Feature` objects. """ return self._relation( id, @@ -1285,7 +1298,7 @@ def create_introns( with open('tmp.gtf', 'w') as fout: for intron in db.create_introns(**intron_kwargs): - fout.write(str(intron) + "\n") + fout.write(str(intron) + "\\n") db.update(gffutils.DataIterator('tmp.gtf'), **create_kwargs) """ @@ -1999,11 +2012,6 @@ def seqids(self): for (i,) in c: yield i - # Recycle the docs for _relation so they stay consistent between parents() - # and children() - children.__doc__ = children.__doc__.format(_relation_docstring=_relation.__doc__) - parents.__doc__ = parents.__doc__.format(_relation_docstring=_relation.__doc__) - # Add the docs for methods that call helpers.make_query() for method in [parents, children, features_of_type, all_features]: method.__doc__ = method.__doc__.format(_method_doc=_method_doc) diff --git a/gffutils/iterators.py b/gffutils/iterators.py index ed0c8389..bdd27477 100644 --- a/gffutils/iterators.py +++ b/gffutils/iterators.py @@ -279,6 +279,8 @@ def DataIterator( provided, you should probably also use `force_dialect_check=False` and `checklines=0` but this is not enforced. 
""" + if isinstance(data, os.PathLike): + data = os.fspath(data) if isinstance(data, _BaseIterator): return data diff --git a/gffutils/parser.py b/gffutils/parser.py index 058423ad..4ff740a5 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -1,9 +1,8 @@ # Portions copied over from BCBio.GFF.GFFParser import re -import copy import collections -import urllib +from urllib import parse from gffutils import constants from gffutils.exceptions import AttributeStringError @@ -16,7 +15,27 @@ ch.setFormatter(formatter) logger.addHandler(ch) -gff3_kw_pat = re.compile(r"\w+=") +# Regex for each separator that will be tested +quoted_semicolon_patterns = dict() + +for sep in (" ; ", "; ", ";"): + quoted_semicolon_patterns[sep] = re.compile( + rf""" + {re.escape(sep)} # The separator we're considering (escaped for VERBOSE mode) + (?= # Positive lookahead: does remaining content match? + (?: # Start non-capturing group + [^"] # Either: match any character that is NOT a quote + | # OR + "[^"]*" # Match a complete quoted string, specifically: + # - opening quote ", followed by + # - zero or more non-quote characters [^"]* + # - followed by closing quote " + )* # Repeat the above pattern zero or more times + $ # Until we reach the end of the string + ) # End of lookahead + """, + re.VERBOSE, + ) # Encoding/decoding notes # ----------------------- @@ -50,9 +69,9 @@ # # See also issue #98. # -# Note that spaces are NOT encoded. Some GFF files have spaces encoded; in -# these cases round-trip invariance will not hold since the %20 will be decoded -# but not re-encoded. +# Note that spaces are NOT supposed to be encoded. Yet some GFF files have +# spaces encoded anyway; in these cases round-trip invariance will not hold +# since the %20 will be decoded but not re-encoded. 
_to_quote = "\n\t\r%;=&," _to_quote += "".join([chr(i) for i in range(32)]) _to_quote += chr(127) @@ -74,6 +93,235 @@ def __missing__(self, b): quoter = Quoter() +def _split_keyvals(keyval_str, dialect=None): + """ + Dialect detection requires partially parsing the attributes. + """ + from gffutils import feature + + quals = feature.dict_class() + + if not keyval_str: + return quals, dialect + + infer_dialect = False + if dialect is None: + infer_dialect = True + dialect = {} + + # No known cases yet of different multival separator + dialect["multival separator"] = "," + + # Detection for these dialect fields can work on the full attribute + # string. Other detection needs to wait until we've further parsed the + # attributes. + if infer_dialect: + dialect["trailing semicolon"] = keyval_str[-1] == ";" + dialect["leading semicolon"] = keyval_str[0] == ";" + semicolon_in_quotes = False + sep = None + for sep in (" ; ", "; ", ";"): + parts = keyval_str.split(sep) + if len(parts) > 1: + # If naive split differs from more expensive regex, we infer there was + # a semicolon within quoted value and we'll have to use the expensive + # method later + parts_regex = re.split(quoted_semicolon_patterns[sep], keyval_str) + if parts != parts_regex: + semicolon_in_quotes = True + break + dialect["semicolon in quotes"] = semicolon_in_quotes + dialect["field separator"] = sep + + if dialect["trailing semicolon"]: + keyval_str = keyval_str.rstrip(";") + + if dialect["leading semicolon"]: + keyval_str = keyval_str.lstrip(";") + + if dialect["semicolon in quotes"]: + parts = re.split( + quoted_semicolon_patterns[dialect["field separator"]], keyval_str + ) + else: + parts = keyval_str.split(dialect["field separator"]) + + # The next stage of dialect inference works on the 'parts' -- unsplit + # keyval pairs -- like: + # + # parts = ["ID=001", "Name=gene1"] + # + # or + # + # parts = ["gene_id ENSG001", "gene_biotype protein_coding"] + # + if infer_dialect: + dialect["fmt"] = "gff3" + + 
# Note: so far, have not found cases where we need to check more than + # the first item + if "=" in parts[0]: + dialect["fmt"] = "gff3" + dialect["keyval separator"] = "=" + else: + dialect["fmt"] = "gtf" + dialect["keyval separator"] = " " + + # Now we split + # + # parts = ["ID=001", "Name=gene1"] + # + # into + # + # key_val_tuples = [("ID", "001"), ("Name", "gene1")] + # + # in a dialect-dependent manner. + kvsep = dialect["keyval separator"] + key_val_tuples = [p.split(kvsep) for p in parts] + + # With the split keys we can detect whether any are repeated + if infer_dialect: + keys = [i[0] for i in key_val_tuples] + dialect["repeated keys"] = len(keys) != len(set(keys)) + + # For dialect detection, this will help figure out if there is + # inconsistent quoting across values. It will only be used in the loop + # below if infer_dialect is True + quoted_values = [] + + # Now work splitting the keys if needed. + for i in key_val_tuples: + + if len(i) == 2: + # Easy, on-spec case + key, val = i + + elif len(i) == 1: + # By convention, no value becomes an empty string, e.g. when done + # parsing, + # + # "ID=001;is_gene;" + # + # will end up as: + # + # {"ID": "001", "is_gene": ""} + key = i[0] + val = "" + + else: + # Multiple *spaces* within quoted values are joined back together + # without requiring a regex, in contrast to when there's *field* + # separator like a semicolon in the values. 
+ # + # That is: + # + # attributes = 'gene_description "an important gene"; gene_id "g001"' + # + # when split on spaces, becomes + # + # key_val_tuples = [("gene_description", "an", "important", "gene"), ("gene_id", "g001")] + # + # so here when we only keep the first token as a key, that first + # key/val pair will become: + # + # { + # "gene_description": ["an important gene"], + # "gene_id": ["g001"], + # } + # + # Another pathological case, this time for GFF3: + # + # Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126 + # + # will become the following: + # + # { + # "Alias": ["SGN-M1347"], + # "ID": ["T0028"], + # "Note": ["marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126"], + # } + # + key = i[0] + val = kvsep.join(i[1:]) + + # By convention all values are lists, even if there's only one value + # (or even no values) + if key not in quals: + quals[key] = [] + + # This will run on every value, accumulating in quoted_values to check + # later for consistency + if infer_dialect: + quoted = len(val) > 0 and val[0] == '"' and val[-1] == '"' + quoted_values.append(quoted) + dialect["quoted GFF2 values"] = quoted + + if dialect["quoted GFF2 values"] and val: + val = val.strip('"') + + if val: + # For repeated keys dialect, don't split on an internal comma. That is, + # + # attributes = 'db_xref="g01,g02"; db_xref="XYZ"' + # + # becomes: + # + # { + # "db_xref": ["g01,g02", "XYZ"] + # } + # + if dialect.get("repeated keys"): + quals[key].append(val) + + # Otherwise, split but only if it's a comma without a space. So: + # + # attributes = 'db_xref="g01,g02"' + # + # becomes + # { + # "db_xref": ["g01", "g02"] + # } + # but + # + # attributes = 'description="kinase, subunit 1"' + # ^ note the space here + # becomes + # { + # "description": ["kinase, subunit 1"] + # } + # + else: + # E.g. 
the "kinase, subunit 1" example above + if ", " in val: + quals[key].append(val) + else: + quals[key].extend(val.split(",")) + + # If there was inconsistent quoting, we fall back to "not quoted" so + # as to avoid incorrectly stripping off first and last quotes. + if infer_dialect and len(set(quoted_values)) > 1: + # Prior behavior was to use whatever the first value used + dialect["quoted GFF2 values"] = quoted_values[0] + + # Though there could be an argument for considering quotes in mixed + # cases to be part of the string, though technically they should be + # %-encoded if so. + # dialect["quoted GFF2 values"] = False + + # Handle unquoting of %-encoded values + if not constants.ignore_url_escape_characters and dialect["fmt"] == "gff3": + for key, vals in quals.items(): + unquoted = [parse.unquote(v) for v in vals] + quals[key] = unquoted + + # Now that we're not supporting old Python versions we can rely on dict + # insertion order + if infer_dialect: + dialect["order"] = list(quals.keys()) + + return quals, dialect + + def _reconstruct(keyvals, dialect, keep_order=False, sort_attribute_values=False): """ Reconstructs the original attributes string according to the dialect. @@ -156,6 +404,20 @@ def sort_key(x): part = key else: if dialect["fmt"] == "gtf": + # By convention, GTF attributes with no value are reconstructed + # with an empty string. E.g.: + # 'gene_id "gene1"; is_gene;' + # + # becomes + # + # { + # "gene_id": "gene1", + # "is_gene": "" + # } + # + # and is printed as: + # + # 'gene_id "gene1"; is_gene "";' part = dialect["keyval separator"].join([key, '""']) else: part = key @@ -169,207 +431,3 @@ def sort_key(x): parts_str += ";" return parts_str - - -# TODO: -# Cythonize -- profiling shows that the bulk of the time is spent on this -# function... 
-def _split_keyvals(keyval_str, dialect=None): - """ - Given the string attributes field of a GFF-like line, split it into an - attributes dictionary and a "dialect" dictionary which contains information - needed to reconstruct the original string. - - Lots of logic here to handle all the corner cases. - - If `dialect` is None, then do all the logic to infer a dialect from this - attribute string. - - Otherwise, use the provided dialect (and return it at the end). - """ - - def _unquote_quals(quals, dialect): - """ - Handles the unquoting (decoding) of percent-encoded characters. - - See notes on encoding/decoding above. - """ - if not constants.ignore_url_escape_characters and dialect["fmt"] == "gff3": - for key, vals in quals.items(): - unquoted = [urllib.parse.unquote(v) for v in vals] - quals[key] = unquoted - return quals - - infer_dialect = False - if dialect is None: - # Make a copy of default dialect so it can be modified as needed - dialect = copy.copy(constants.dialect) - infer_dialect = True - from gffutils import feature - - quals = feature.dict_class() - if not keyval_str: - return quals, dialect - - # If a dialect was provided, then use that directly. 
- if not infer_dialect: - if dialect["trailing semicolon"]: - keyval_str = keyval_str.rstrip(";") - - parts = keyval_str.split(dialect["field separator"]) - - kvsep = dialect["keyval separator"] - if dialect["leading semicolon"]: - pieces = [] - for p in parts: - if p and p[0] == ";": - p = p[1:] - pieces.append(p.strip().split(kvsep)) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - if dialect["fmt"] == "gff3": - key_vals = [p.split(kvsep) for p in parts] - else: - leadingsemicolon = dialect["leading semicolon"] - pieces = [] - for i, p in enumerate(parts): - if i == 0 and leadingsemicolon: - p = p[1:] - pieces.append(p.strip().split(kvsep)) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - quoted = dialect["quoted GFF2 values"] - for item in key_vals: - # Easy if it follows spec - if len(item) == 2: - key, val = item - - # Only key provided? - elif len(item) == 1: - key = item[0] - val = "" - - else: - key = item[0] - val = dialect["keyval separator"].join(item[1:]) - - try: - quals[key] - except KeyError: - quals[key] = [] - - if quoted: - if len(val) > 0 and val[0] == '"' and val[-1] == '"': - val = val[1:-1] - - if val: - # TODO: if there are extra commas for a value, just use empty - # strings - # quals[key].extend([v for v in val.split(',') if v]) - vals = val.split(",") - quals[key].extend(vals) - - quals = _unquote_quals(quals, dialect) - return quals, dialect - - # If we got here, then we need to infer the dialect.... - # - # Reset the order to an empty list so that it will only be populated with - # keys that are found in the file. - dialect["order"] = [] - - # ensembl GTF has trailing semicolon - if keyval_str[-1] == ";": - keyval_str = keyval_str[:-1] - dialect["trailing semicolon"] = True - - # GFF2/GTF has a semicolon with at least one space after it. - # Spaces can be on both sides (e.g. wormbase) - # GFF3 works with no spaces. - # So split on the first one we can recognize... 
- for sep in (" ; ", "; ", ";"): - parts = keyval_str.split(sep) - if len(parts) > 1: - dialect["field separator"] = sep - break - - # Is it GFF3? They have key-vals separated by "=" - if gff3_kw_pat.match(parts[0]): - key_vals = [p.split("=") for p in parts] - dialect["fmt"] = "gff3" - dialect["keyval separator"] = "=" - - # Otherwise, key-vals separated by space. Key is first item. - else: - dialect["keyval separator"] = " " - pieces = [] - for p in parts: - # Fix misplaced semicolons in keys in some GFF2 files - if p and p[0] == ";": - p = p[1:] - dialect["leading semicolon"] = True - pieces.append(p.strip().split(" ")) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - for item in key_vals: - - # Easy if it follows spec - if len(item) == 2: - key, val = item - - # Only key provided? - elif len(item) == 1: - key = item[0] - val = "" - - # Pathological cases where values of a key have within them the key-val - # separator, e.g., - # Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126 - # ^ ^ - else: - key = item[0] - val = dialect["keyval separator"].join(item[1:]) - - # Is the key already in there? - if key in quals: - dialect["repeated keys"] = True - else: - quals[key] = [] - - # Remove quotes in GFF2 - if len(val) > 0 and val[0] == '"' and val[-1] == '"': - val = val[1:-1] - dialect["quoted GFF2 values"] = True - if val: - - # TODO: if there are extra commas for a value, just use empty - # strings - # quals[key].extend([v for v in val.split(',') if v]) - - # See issue #198, where commas within a description can incorrectly - # cause the dialect inference to conclude that there are not - # repeated keys. - # - # More description in PR #208. 
- if dialect["repeated keys"]: - quals[key].append(val) - else: - vals = val.split(",") - - # If anything starts with a leading space, then we infer that - # it was part of a description or some other typographical - # interpretation, not a character to split multiple vals on -- - # and append the original val rather than the split vals. - if any([i[0] == " " for i in vals if i]): - quals[key].append(val) - else: - quals[key].extend(vals) - - # keep track of the order of keys - dialect["order"].append(key) - - if (dialect["keyval separator"] == " ") and (dialect["quoted GFF2 values"]): - dialect["fmt"] = "gtf" - - quals = _unquote_quals(quals, dialect) - return quals, dialect diff --git a/gffutils/pybedtools_integration.py b/gffutils/pybedtools_integration.py index 5c5c2b90..e01e4911 100644 --- a/gffutils/pybedtools_integration.py +++ b/gffutils/pybedtools_integration.py @@ -113,7 +113,7 @@ def tsses( if they overlap (as in the first two): - >>> print(tsses(db)) # doctest: +NORMALIZE_WHITESPACE + >>> print(gffutils.pybedtools_integration.tsses(db)) # doctest: +NORMALIZE_WHITESPACE chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300689"; chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300690"; chr2L gffutils_derived transcript_TSS 11000 11000 . - . gene_id "Fk_gene_1"; transcript_id "transcript_Fk_gene_1"; @@ -124,7 +124,7 @@ def tsses( Default merging, showing the first two TSSes merged and reported as a single unique TSS for the gene. Note the conversion to BED: - >>> x = tsses(db, merge_overlapping=True) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208 . + chr2L 10999 11000 Fk_gene_1 . - @@ -135,7 +135,7 @@ def tsses( be easier to parse than the original GTF or GFF file. With no merging specified, we must add `as_bed6=True` to see the names in BED format. 
- >>> x = tsses(db, attrs=['gene_id', 'transcript_id'], as_bed6=True) + >>> x = gffutils.pybedtools_integration.tsses(db, attrs=['gene_id', 'transcript_id'], as_bed6=True) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208:FBtr0300689 . + chr2L 7528 7529 FBgn0031208:FBtr0300690 . + @@ -145,7 +145,7 @@ def tsses( Use a 3kb merge distance so the last 2 features are merged together: - >>> x = tsses(db, merge_overlapping=True, merge_kwargs=dict(d=3000)) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True, merge_kwargs=dict(d=3000)) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208 . + chr2L 10999 12500 Fk_gene_1,Fk_gene_2 . - @@ -154,7 +154,7 @@ def tsses( The set of unique TSSes for each gene, +1kb upstream and 500bp downstream: - >>> x = tsses(db, merge_overlapping=True) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True) >>> x = x.slop(l=1000, r=500, s=True, genome='dm3') >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 6528 8029 FBgn0031208 . + diff --git a/gffutils/scripts/gffutils-cli b/gffutils/scripts/gffutils-cli index 051b76d5..70a882b5 100755 --- a/gffutils/scripts/gffutils-cli +++ b/gffutils/scripts/gffutils-cli @@ -76,7 +76,7 @@ def fetch(db, ids): (like grep -v)''') @arg('--exclude-self', help='''Use this to suppress reporting the IDs you've provided.''') -def children(db, ids, limit=None, exclude=None, exclude_self=False): +def children(db, ids, *, limit=None, exclude=None, exclude_self=False): """ Fetch children from the database according to ID. """ @@ -110,7 +110,7 @@ def children(db, ids, limit=None, exclude=None, exclude_self=False): (like grep -v)''') @arg('--exclude-self', help='''Use this to suppress reporting the IDs you've provided.''') -def parents(db, ids, limit=None, exclude=None, exclude_self=False): +def parents(db, ids, *, limit=None, exclude=None, exclude_self=False): """ Fetch parents from the database according to ID. 
""" @@ -167,7 +167,7 @@ def common(db): @arg('--disable-infer-transcripts', help='''Disable inferring of transcript extents for GTF files. Use this if your GTF file already has "transcript" featuretypes''') -def create(filename, output=None, force=False, quiet=False, merge="merge", +def create(filename, *, output=None, force=False, quiet=False, merge="merge", disable_infer_genes=False, disable_infer_transcripts=False): """ Create a database. @@ -198,7 +198,7 @@ def clean(filename): @arg('--in-place', help='''Sanitize file in-place: overwrites current file with sanitized version.''') -def sanitize(filename, +def sanitize(filename, *, in_memory=True, in_place=False): """ @@ -225,7 +225,7 @@ def sanitize(filename, @arg('filename', help='''GFF or GTF file to use.''') @arg('--in-place', help='''Remove duplicates in place (overwrite current file.)''') -def rmdups(filename, in_place=False): +def rmdups(filename, *, in_place=False): """ Remove duplicates from a GFF file. """ @@ -278,7 +278,7 @@ def convert(filename): @arg('--featuretype', help='''Restrict to a particular featuretype. This can be faster than doing a grep on the output, since it restricts the search space in the database''') -def search(db, text, featuretype=None): +def search(db, text, *, featuretype=None): """ Search the attributes. 
""" diff --git a/gffutils/test/attr_test_cases.py b/gffutils/test/attr_test_cases.py index b08afbe3..9ee1de96 100644 --- a/gffutils/test/attr_test_cases.py +++ b/gffutils/test/attr_test_cases.py @@ -15,36 +15,130 @@ """ + attrs = [ + dict( + str="ID=001;Name=gene1", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={"order": ["ID", "Name"]}, + ), + dict( + str="ID=001;Name=gene1;", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={"trailing semicolon": True, "order": ["ID", "Name"]}, + ), + dict( + str="ID=001; Name=gene1;", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={ + "trailing semicolon": True, + "field separator": "; ", + "order": ["ID", "Name"], + }, + ), + dict( + str='ID="001"', + attrs={"ID": ["001"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "order": ["ID"], + }, + ), + dict( + str='ID="001"; Name="gene1"; types="a,b,c"', + attrs={"ID": ["001"], "Name": ["gene1"], "types": ["a", "b", "c"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "field separator": "; ", + "order": ["ID", "Name", "types"], + }, + ), + dict( + str='ID="001"; Name="gene1"; types="a"; types="b"; types="c"', + attrs={"ID": ["001"], "Name": ["gene1"], "types": ["a", "b", "c"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "field separator": "; ", + "repeated keys": True, + "order": ["ID", "Name", "types"], + }, + ), + dict( + str="Name=gene1;ID=001", + attrs={"Name": ["gene1"], "ID": ["001"]}, + ok=None, + dialect_mods={"order": ["Name", "ID"]}, + ), + dict( + str='gene_id "001";gene_name "gene1"', + attrs={"gene_id": ["001"], "gene_name": ["gene1"]}, + ok=None, + dialect_mods={ + "fmt": "gtf", + "quoted GFF2 values": True, + "keyval separator": " ", + "order": ["gene_id", "gene_name"], + }, + ), # c_elegans_WS199_shortened_gff.txt - ( - "count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;" "transcript=B0019.1", - { + dict( + 
str="count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;transcript=B0019.1", + attrs={ "count": ["1"], "gene": ["amx-2"], "sequence": ["SAGE:ggcagagtcttttggca"], "transcript": ["B0019.1"], }, - None, + ok=None, + dialect_mods={"order": ["count", "gene", "sequence", "transcript"]}, ), # ensembl_gtf.txt - ( - 'gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; ' - 'gene_name "Y74C9A.6"; transcript_name "NR_001477.2";', - { + dict( + str=( + 'gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2";' + ), + attrs={ "gene_id": ["Y74C9A.6"], "transcript_id": ["Y74C9A.6"], "exon_number": ["1"], "gene_name": ["Y74C9A.6"], "transcript_name": ["NR_001477.2"], }, - None, + ok=None, + dialect_mods={ + "trailing semicolon": True, + "fmt": "gtf", + "keyval separator": " ", + "field separator": "; ", + "quoted GFF2 values": True, + "order": [ + "gene_id", + "transcript_id", + "exon_number", + "gene_name", + "transcript_name", + ], + }, ), # F3-unique-3.v2.gff - ( - "g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23" - ",9,16,20,11,7,8,4,4,14;u=0,0,0,1", - { + dict( + str="g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23,9,16,20,11,7,8,4,4,14;u=0,0,0,1", + attrs={ "g": ["A3233312322232122211"], "i": ["1"], "p": ["1.000"], @@ -72,20 +166,27 @@ ], "u": ["0", "0", "0", "1"], }, - None, + ok=None, + dialect_mods={"order": ["g", "i", "p", "q", "u"]}, ), # glimmer_nokeyval.gff3 - ( - "ID=GL0000006;Name=GL0000006;Lack 3'-end;", - {"ID": ["GL0000006"], "Name": ["GL0000006"], "Lack 3'-end": []}, - None, + dict( + str="ID=GL0000006;Name=GL0000006;Lack 3'-end;", + attrs={"ID": ["GL0000006"], "Name": ["GL0000006"], "Lack 3'-end": []}, + ok=None, + dialect_mods={ + "order": ["ID", "Name", "Lack 3'-end"], + "trailing semicolon": True, + }, ), # hybrid1.gff3 - ( - "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" - 
"A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" - "SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1", - { + dict( + str=( + "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" + "A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" + "SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1" + ), + attrs={ "ID": ["A00469"], "Dbxref": [ "AFFX-U133:205840_x_at", @@ -99,9 +200,10 @@ "Note": ["growth hormone 1"], "Alias": ["GH1"], }, - "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" + ok="ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" "A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" "SOMA_HUMAN;Note=growth hormone 1;Alias=GH1", + dialect_mods={"order": ["ID", "Dbxref", "Note", "Alias"]}, ), # jgi_gff2.txt # @@ -109,19 +211,30 @@ # quoted but string values are. Only way to make this be invariant is to # keep track of the "flavor" of each attribute; not sure it's worth the # effort / processing time. - ( - 'name "fgenesh1_pg.C_chr_1000007"; transcriptId 873', - {"name": ["fgenesh1_pg.C_chr_1000007"], "transcriptId": ["873"]}, - 'name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"', + dict( + str='name "fgenesh1_pg.C_chr_1000007"; transcriptId 873', + attrs={"name": ["fgenesh1_pg.C_chr_1000007"], "transcriptId": ["873"]}, + ok='name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"', + dialect_mods={ + "order": ["name", "transcriptId"], + "quoted GFF2 values": True, + "keyval separator": " ", + "fmt": "gtf", + "field separator": "; ", + }, ), # mouse_extra_comma.gff3: extra comma line # # Note extra empty string in the dictionary's "Parent" field. 
# - ( - "Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,", - {"Name": ["CDS:NC_000083.5:LOC100040603"], "Parent": ["XM_001475631.1", ""]}, - None, + dict( + str="Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,", + attrs={ + "Name": ["CDS:NC_000083.5:LOC100040603"], + "Parent": ["XM_001475631.1", ""], + }, + ok=None, + dialect_mods={"order": ["Name", "Parent"]}, ), # mouse_extra_comma.gff3 # @@ -135,20 +248,23 @@ # # In both cases, the dictionary entry is simply an empty list; it's just in # the reconstruction where things get tricky. - ( - "ID=;Parent=XM_001475631.1", - {"ID": [], "Parent": ["XM_001475631.1"]}, - "ID;Parent=XM_001475631.1", + dict( + str="ID=;Parent=XM_001475631.1", + attrs={"ID": [], "Parent": ["XM_001475631.1"]}, + ok="ID;Parent=XM_001475631.1", + dialect_mods={"order": ["ID", "Parent"]}, ), # ncbi_gff3.txt - ( - "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" - "locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20mat" - "ch%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20prote" - "in%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;p" - "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" - "exon_number=1", - { + dict( + str=( + "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" + "locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20mat" + "ch%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20prote" + "in%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;p" + "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" + "exon_number=1" + ), + attrs={ "ID": ["NC_008596.1:speB:unknown_transcript_1"], "Parent": ["NC_008596.1:speB"], "locus_tag": ["MSMEG_1072"], @@ -164,18 +280,39 @@ "db_xref": ["GI:118469242", "GeneID:4535378"], "exon_number": ["1"], }, - "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" + ok="ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" 
"locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified by mat" "ch to protein family HMM PF00491%3B match to prote" "in family HMM TIGR01230;transl_table=11;product=agmatinase;p" "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" "exon_number=1", + dialect_mods={ + "order": [ + "ID", + "Parent", + "locus_tag", + "EC_number", + "note", + "transl_table", + "product", + "protein_id", + "db_xref", + "exon_number", + ], + "repeated keys": True, + }, ), # wormbase_gff2_alt.txt # - ( - 'CDS "cr01.sctg102.wum.2.1"', - {"CDS": ["cr01.sctg102.wum.2.1"]}, - None, + dict( + str='CDS "cr01.sctg102.wum.2.1"', + attrs={"CDS": ["cr01.sctg102.wum.2.1"]}, + ok=None, + dialect_mods={ + "order": ["CDS"], + "quoted GFF2 values": True, + "keyval separator": " ", + "fmt": "gtf", + }, ), ] diff --git a/gffutils/test/data/ensembl_gtf.txt b/gffutils/test/data/ensembl_gtf.txt index f54f8fdd..88de6d51 100644 --- a/gffutils/test/data/ensembl_gtf.txt +++ b/gffutils/test/data/ensembl_gtf.txt @@ -1,33 +1,33 @@ -I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2"; -I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12764291 12764471 . 
- 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763729 12763882 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12762127 12762268 . 
- 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12761172 12761516 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12759949 12760013 . 
- 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding stop_codon 12759745 12759747 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; +I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2"; +I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12764291 12764471 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763729 12763882 . - . 
gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12762127 12762268 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12761172 12761516 . - . 
gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12759949 12760013 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding stop_codon 12759745 12759747 . 
- 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; diff --git a/gffutils/test/parser_test.py b/gffutils/test/parser_test.py index ed578a7e..68b30481 100644 --- a/gffutils/test/parser_test.py +++ b/gffutils/test/parser_test.py @@ -53,17 +53,32 @@ def test_attrs_OK(item): (see attr_test_cases.py for details); `acceptable_reconstruction` handles those. """ - attr_str, attr_dict, acceptable_reconstruction = item - result, dialect = parser._split_keyvals(attr_str) + attr_str = item["str"] + attr_dict = item["attrs"] + acceptable_reconstruction = item["ok"] + dialect_mods = item["dialect_mods"] + + result, inferred_dialect = parser._split_keyvals(attr_str) result = dict(result) assert result == attr_dict, result - reconstructed = parser._reconstruct(result, dialect, keep_order=True) + reconstructed = parser._reconstruct(result, inferred_dialect, keep_order=True) if acceptable_reconstruction: assert reconstructed == acceptable_reconstruction, reconstructed else: assert reconstructed == attr_str, reconstructed + # Get the default dialect for comparison, and update it with any diffs + # indicated by the test case + default_dialect = constants.dialect.copy() + default_dialect.update(dialect_mods) + + print(inferred_dialect) + print(dialect_mods) + print(attr_str) + assert default_dialect == inferred_dialect + + def parser_smoke_test(): """ @@ -75,7 +90,7 @@ def parser_smoke_test(): parser.logger.setLevel(logging.CRITICAL) for filename in TEST_FILENAMES: p = iterators._FileIterator(filename) - for i in p: + for _ in p: continue @@ -93,7 +108,8 @@ def test_empty_recontruct(): def test_empty_split_keyvals(): attrs, dialect = parser._split_keyvals(keyval_str=None) assert attrs == feature.dict_class() - assert dialect == constants.dialect + # assert dialect == constants.dialect + assert dialect is None def test_repeated_keys_conflict(): @@ -115,7 +131,7 @@ def test_parser_from_string(): # # _StringIterator has been 
removed and is instead handled by DataIterator # creating a temp file and returning a _FileIterator. - return True + pass def test_valid_line_count(): diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py index 2b88cc04..1780d996 100644 --- a/gffutils/test/test_1.py +++ b/gffutils/test/test_1.py @@ -14,10 +14,7 @@ import tempfile import http.server as SimpleHTTPServer -if sys.version_info.major == 3: - import socketserver as SocketServer -else: - import SocketServer +import socketserver as SocketServer import multiprocessing import json @@ -482,58 +479,51 @@ def test_sanitize_gff(): print("Sanitized GFF successfully.") -def test_region(): - +@pytest.mark.parametrize("kwargs,expected", [ + # previously failed, see issue #45 + (dict(seqid="chr2L", start=1, end=2e9, completely_within=True), 27), + (dict(region="chr2L", start=0), ValueError), + (dict(region="chr2L", end=0), ValueError), + (dict(region="chr2L", seqid=0), ValueError), + # these coords should catch everything + (dict(region="chr2L:7529-12500"), 27), + # stranded versions: + (dict(region="chr2L:7529-12500", strand="."), 0), + (dict(region="chr2L:7529-12500", strand="+"), 21), + (dict(region="chr2L:7529-12500", strand="-"), 6), + # different ways of selecting only that last exon in the last gene: + (dict(seqid="chr2L", start=11500, featuretype="exon"), 1), + (dict(seqid="chr2L", start=9500, featuretype="exon", strand="+"), 1), + # alternative method + (dict(seqid="chr2L", start=7529, end=12500), 27), + # since default completely_within=False, this catches anything that + # falls after 7680. So it only excludes the 5'UTR, which ends at 7679. 
+ (dict(seqid="chr2L", start=7680), 26), + # but completely_within=True will exclude the gene and mRNAs, first + # exon and the 5'UTR + (dict(seqid="chr2L", start=7680, completely_within=True), 22), + # similarly, this will *exclude* anything before 7680 + (dict(seqid="chr2L", end=7680), 5), + # and also similarly, this will only get us the 5'UTR which is the only + # feature falling completely before 7680 + (dict(seqid="chr2L", end=7680, completely_within=True), 1), + # and there's only features from chr2L in this file, so this catches + # everything too + (dict(region="chr2L"), 27), + # using seqid should work similarly to `region` with only chromosome + (dict(seqid="chr2L"), 27), + # nonexistent + (dict(region="nowhere"), 0), +]) +def test_region(kwargs, expected): db_fname = gffutils.example_filename("FBgn0031208.gff") db = gffutils.create_db(db_fname, ":memory:", keep_order=True) - def _check(item): - kwargs, expected = item - try: - obs = list(db.region(**kwargs)) - assert len(obs) == expected, "expected %s got %s" % (expected, len(obs)) - except expected: - pass - - regions = [ - # previously failed, see issue #45 - (dict(seqid="chr2L", start=1, end=2e9, completely_within=True), 27), - (dict(region="chr2L", start=0), ValueError), - (dict(region="chr2L", end=0), ValueError), - (dict(region="chr2L", seqid=0), ValueError), - # these coords should catch everything - (dict(region="chr2L:7529-12500"), 27), - # stranded versions: - (dict(region="chr2L:7529-12500", strand="."), 0), - (dict(region="chr2L:7529-12500", strand="+"), 21), - (dict(region="chr2L:7529-12500", strand="-"), 6), - # different ways of selecting only that last exon in the last gene: - (dict(seqid="chr2L", start=11500, featuretype="exon"), 1), - (dict(seqid="chr2L", start=9500, featuretype="exon", strand="+"), 1), - # alternative method - (dict(seqid="chr2L", start=7529, end=12500), 27), - # since default completely_within=False, this catches anything that - # falls after 7680. 
So it only excludes the 5'UTR, which ends at 7679. - (dict(seqid="chr2L", start=7680), 26), - # but completely_within=True will exclude the gene and mRNAs, first - # exon and the 5'UTR - (dict(seqid="chr2L", start=7680, completely_within=True), 22), - # similarly, this will *exclude* anything before 7680 - (dict(seqid="chr2L", end=7680), 5), - # and also similarly, this will only get us the 5'UTR which is the only - # feature falling completely before 7680 - (dict(seqid="chr2L", end=7680, completely_within=True), 1), - # and there's only features from chr2L in this file, so this catches - # everything too - (dict(region="chr2L"), 27), - # using seqid should work similarly to `region` with only chromosome - (dict(seqid="chr2L"), 27), - # nonexistent - (dict(region="nowhere"), 0), - ] - - for item in regions: - yield _check, item + try: + obs = list(db.region(**kwargs)) + assert len(obs) == expected, "expected %s got %s" % (expected, len(obs)) + except expected: + pass def test_nonascii(): diff --git a/gffutils/test/test_biopython_integration.py b/gffutils/test/test_biopython_integration.py index 58c5866a..e9f8e81d 100644 --- a/gffutils/test/test_biopython_integration.py +++ b/gffutils/test/test_biopython_integration.py @@ -1,6 +1,10 @@ from gffutils import example_filename import gffutils import gffutils.biopython_integration as bp +import pytest + +# Skip tests entirely if BioPython not available +pytest.importorskip('Bio') def test_roundtrip(): diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py index 79996ba5..9e4686a0 100644 --- a/gffutils/test/test_issues.py +++ b/gffutils/test/test_issues.py @@ -6,9 +6,12 @@ import os import tempfile import difflib +from pathlib import Path from textwrap import dedent import gffutils from gffutils import feature +from gffutils import helpers +from gffutils.gffwriter import GFFWriter from gffutils import merge_criteria as mc import pytest @@ -200,7 +203,10 @@ def test_pr_144(): assert f.attributes["a"] == 
[""] assert str(f) == ". . . . . . . . a" g = gffutils.feature.feature_from_line(str(f)) - assert g == f + g.dialect["fmt"] = "gff3" + print(g.attributes) + print(g.dialect) + assert str(g) == str(f) def test_pr_172(): @@ -452,21 +458,43 @@ def test_issue_198(): assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] - # If we remove one of the db_xref keys, then the parser sees the comma and - # figures it's a multivalue key. + # If we remove one of the db_xref keys, then previously the parser saw the + # comma and figured it was a multivalue key, and split it. Now, it's + # correctly identified as a single-value key. + # + # Note that we still have gene_synonym as a repeated key. line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) + assert f.dialect["repeated keys"] # Previous result, note leading space --------------------------->| | # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene'] + + # Current result: not split. assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] - # But removing that space before "pseudogene" means it's interpreted as - # a multivalue attribute + # Removing that space before "pseudogene" might mean it's a multivalue, but + # we decide on the convention that if keys are repeated at all, that wins. + # So we still don't split line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . 
gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) + assert f.dialect["repeated keys"] + assert f.attributes["description"] == ["WASP family homolog 7,pseudogene"] + + # But if we get rid of all repeated keys, it's interpreted as multiple values + line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; pseudo "true";' + f = feature.feature_from_line(line) + assert not f.dialect["repeated keys"] assert f.attributes["description"] == ["WASP family homolog 7", "pseudogene"] + # ....but if there's a ", " (comma followed by space) instead of just + # comma, then it's not split. + line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; pseudo "true";' + f = feature.feature_from_line(line) + assert not f.dialect["repeated keys"] + assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] + + # Confirm behavior of corner cases like a trailing comma line = "chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1," f = feature.feature_from_line(line) @@ -578,7 +606,7 @@ def _check(txt, expected_keys, dialect_trailing_semicolon): ) -def test_issue_213(): +def test_issue_213(tmp_path): # GFF header directives seem to be not parsed when building a db from # a file, even though it seems to work fine from a string. 
data = dedent( @@ -599,16 +627,73 @@ def test_issue_213(): db = gffutils.create_db(data, dbfn=":memory:", from_string=True, verbose=False) assert db.directives == ["gff-version 3"], db.directives - # Ensure they're parsed into the db from a file - tmp = tempfile.NamedTemporaryFile(delete=False).name + tmp = tmp_path / "issue_213.gff3" with open(tmp, "w") as fout: fout.write(data + "\n") - db = gffutils.create_db(tmp, ":memory:") - assert db.directives == ["gff-version 3"], db.directives - assert len(db.directives) == 1 - # Ensure they're parsed into the db from a file, and going to a file (to - # exactly replicate example in #213) - db = gffutils.create_db(tmp, dbfn="issue_213.db", force=True) - assert db.directives == ["gff-version 3"], db.directives - assert len(db.directives) == 1 + # Ensure they're parsed into the db from a file path for both str/Path. + for input_path in (str(tmp), tmp): + db = gffutils.create_db(input_path, ":memory:") + assert db.directives == ["gff-version 3"], db.directives + assert len(db.directives) == 1 + + # Ensure they're parsed into the db for all str/Path input-output + # combinations when both source and destination are file-backed. + for input_path, output_path in ( + (str(tmp), str(tmp_path / "issue_213_str_str.db")), + (str(tmp), tmp_path / "issue_213_str_path.db"), + (tmp, str(tmp_path / "issue_213_path_str.db")), + (tmp, tmp_path / "issue_213_path_path.db"), + ): + db = gffutils.create_db(input_path, dbfn=output_path, force=True) + assert db.directives == ["gff-version 3"], db.directives + assert len(db.directives) == 1 + + + +def test_pathlike_inputs(tmp_path): + """ + Ensure various functions work with Path and str. 
+ """ + gff = Path(gffutils.example_filename("FBgn0031208.gff")) + fasta = Path(gffutils.example_filename("dm6-chr2L.fa")) + db_path = tmp_path / "pathlike.db" + out_path = tmp_path / "pathlike.gff3" + staged_gff = tmp_path / "pathlike-input.gff3" + staged_gff.write_text(gff.read_text()) + staged_gff_db = Path("%s.%s" % (staged_gff, ".db")) + + db = gffutils.create_db(gff, db_path, force=True) + assert db.dbfn == os.fspath(db_path) + + reopened = gffutils.FeatureDB(db_path) + reopened.delete([], make_backup=True) + assert (tmp_path / "pathlike.db.bak").exists() + + writer = GFFWriter(out_path) + writer.write_rec(next(reopened.all_features())) + writer.close() + assert out_path.exists() + + assert helpers.is_gff_db(db_path) + gffutils.create_db(staged_gff, staged_gff_db, force=True) + assert helpers.get_gff_db(staged_gff) == os.fspath(staged_gff_db) + + seq = reopened["FBgn0031208"].sequence(fasta) + expected_seq = reopened["FBgn0031208"].sequence(os.fspath(fasta)) + assert seq == expected_seq + +def test_issue_212(): + + + data = dedent( + """ + NC_000964.3 RefSeq CDS 410 1747 . 
+ 0 gene_id "BSU_00010"; transcript_id "unassigned_transcript_1"; db_xref "EnsemblGenomes-Gn:BSU00010"; db_xref "EnsemblGenomes-Tr:CAB11777"; db_xref "GOA:P05648"; db_xref "InterPro:IPR001957"; db_xref "InterPro:IPR003593"; db_xref "InterPro:IPR010921"; db_xref "InterPro:IPR013159"; db_xref "InterPro:IPR013317"; db_xref "InterPro:IPR018312"; db_xref "InterPro:IPR020591"; db_xref "InterPro:IPR024633"; db_xref "InterPro:IPR027417"; db_xref "PDB:4TPS"; db_xref "SubtiList:BG10065"; db_xref "UniProtKB/Swiss-Prot:P05648"; db_xref "GenBank:NP_387882.1"; db_xref "GeneID:939978"; experiment "publication(s) with functional evidences, PMID:2167836, 2846289, 12682299, 16120674, 1779750, 28166228"; gbkey "CDS"; gene "dnaA"; locus_tag "BSU_00010"; note "Evidence 1a: Function from experimental evidences in the studied strain; PubMedId: 2167836, 2846289, 12682299, 16120674, 1779750, 28166228; Product type f : factor"; product "chromosomal replication initiator informational ATPase"; protein_id "NP_387882.1"; transl_table "11"; exon_number "1"; + """ + ) + inferred_dialect = gffutils.helpers.infer_dialect(data.split('\t')[-1]) + assert inferred_dialect["semicolon in quotes"] + + f = next(iter(gffutils.DataIterator(data, from_string=True, dialect=inferred_dialect))) + assert f.dialect["semicolon in quotes"] + assert f.attributes["note"] == ["Evidence 1a: Function from experimental evidences in the studied strain; PubMedId: 2167836, 2846289, 12682299, 16120674, 1779750, 28166228; Product type f : factor"] diff --git a/gffutils/version.py b/gffutils/version.py index 70fcf056..2508eeac 100644 --- a/gffutils/version.py +++ b/gffutils/version.py @@ -1 +1,30 @@ -version = "0.13" +import re +from importlib.metadata import PackageNotFoundError, version as distribution_version +from pathlib import Path + + +def _detect_version(): + """ + v0.14 migrated to pyproject.toml format, and the version is now only stored + there. If this package is installed, resolve the installed version. 
Otherwise, inspect pyproject.toml. + """ + try: + return distribution_version("gffutils") + except PackageNotFoundError: + pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml" + try: + contents = pyproject.read_text(encoding="utf-8") + except OSError: + return "0+unknown" + + # tomllib is in py3.11+ and we're supporting earlier versions, so rely + # on regex here. Add "+unknown" to indicate possible divergence from + # the cloned checkout. + match = re.search(r'^version = "([^"]+)"$', contents, re.MULTILINE) + if match: + return match.group(1) + "+unknown" + return "0+unknown" + + +version = _detect_version() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2d737311 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,69 @@ +[build-system] +# setuptools 77.0.3 is when support for plain text values for `license` key, +# used below, is supported. +requires = ["setuptools>=77.0.3"] +build-backend = "setuptools.build_meta" + +[project] +name = "gffutils" +version = "0.14" +readme = { file = "README.rst", content-type = "text/x-rst" } +description = "Work with GFF and GTF files in a flexible database framework" +authors = [ + { name = "Ryan Dale", email = "ryan.dale@nih.gov" }, +] +license = "MIT" +license-files = ["LICENSE"] +requires-python = ">=3.8"  # NOTE(review): setuptools>=77 (build requirement above) no longer supports Python 3.8 — builds on 3.8 cannot resolve the build backend; confirm whether 3.8/3.9 support (also in classifiers) is still intended +dependencies = [ + "pyfaidx>=0.5.5.2", + "argh>=0.26.2", + "argcomplete>=1.9.4", + "simplejson", +] +classifiers = [ + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", +] +urls = { Homepage = "https://github.com/daler/gffutils" } + +[project.optional-dependencies] +test = [ + 
"pytest", +] +optional = [ + "biopython>=1.73", + "pybedtools>=0.8.0", +] +docs = [ + "make", + "numpydoc", + "sphinx", + "sphinx_rtd_theme", + "sphinx-autoapi", +] + +[tool.pytest.ini_options] +addopts = "-v --doctest-modules" +testpaths = ["gffutils"] +markers = [ + "slow: marks tests that are slow to run", +] + +[tool.setuptools] +include-package-data = true +script-files = ["gffutils/scripts/gffutils-cli"] + +[tool.setuptools.packages.find] +include = ["gffutils", "gffutils.*"] + +[tool.setuptools.package-data] +gffutils = ["test/data/*"] diff --git a/setup.py b/setup.py index ef56d1c6..1c12336c 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,4 @@ - -import os -import sys +# v0.14 migrated to pyproject.toml. from setuptools import setup -version_py = os.path.join(os.path.dirname(__file__), 'gffutils', 'version.py') -version = open(version_py).read().strip().split('=')[-1].replace('"', '') -requirements = open(os.path.join(os.path.dirname(__file__), 'requirements.txt')).readlines() -setup( - name='gffutils', - version=version, - install_requires=requirements, - packages=['gffutils', 'gffutils.scripts', 'gffutils.test', - 'gffutils.test.data'], - scripts=['gffutils/scripts/gffutils-cli'], - author='Ryan Dale', - package_dir={'gffutils': 'gffutils'}, - package_data = {'gffutils': ['test/data/*']}, - description="Work with GFF and GTF files in a flexible " - "database framework", - long_description=open("README.rst").read(), - author_email='ryan.dale@nih.gov', - url='https://github.com/daler/gffutils', - classifiers=[ - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Topic :: 
Software Development :: Libraries :: Python Modules', - ], -) +setup()