# Source code for pypi_simple.filenames

import re
from   typing import Optional, Tuple, Union

# Regex fragments that the filename patterns below are assembled from.

# A project name: starts and ends with an ASCII letter or digit; the interior
# may additionally contain ".", "_" and "-".
PROJECT_NAME = r'[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?'
# Same as PROJECT_NAME, except that "-" is not allowed anywhere.
PROJECT_NAME_NODASH = r'[A-Za-z0-9](?:[A-Za-z0-9._]*[A-Za-z0-9])?'
# A version string, matched non-greedily (note the trailing "+?").
VERSION = r'[A-Za-z0-9_.!+-]+?'
# Same as VERSION, except that "-" is not allowed.
VERSION_NODASH = r'[A-Za-z0-9_.!+]+?'
# Recognized archive extensions, including the leading ".".
ARCHIVE_EXT = r'\.(?:tar|tar\.(?:bz2|gz|lz|lzma|xz|Z)|tbz|tgz|tlz|txz|zip)'
# A platform tag: one of the listed OS prefixes followed by any run of
# letters, digits, "-", "." and "_".
PLAT_NAME = r'(?:aix|cygwin|darwin|linux|macosx|solaris|sunos|[wW]in)[-.A-Za-z0-9_]*'
# A Python version tag of the form "py<digits>.<digits>".
PYVER = r'py[0-9]+\.[0-9]+'

#: Regexes for package filenames that can be parsed unambiguously
GOOD_PACKAGE_RGXN = [
    (kind, re.compile(template.format(*fields)))
    for kind, template, fields in (
        # See <https://setuptools.readthedocs.io/en/latest
        #      /formats.html#filename-embedded-metadata>:
        (
            'egg',
            r'^(?P<project>{})-(?P<version>{})(?:-{}(?:-{})?)?\.egg$',
            (PROJECT_NAME_NODASH, VERSION_NODASH, PYVER, PLAT_NAME),
        ),

        # See <http://ftp.rpm.org/max-rpm/ch-rpm-file-format.html>:
        # (The architecture pattern is mainly just a guess based on what's
        # currently on PyPI.)
        (
            'rpm',
            r'^(?P<project>{})-(?P<version>{})-[^-]+\.[A-Za-z0-9._]+\.rpm$',
            (PROJECT_NAME, VERSION_NODASH),
        ),

        # Regex adapted from <https://git.io/fAclu>:
        (
            'wheel',
            r'^(?P<project>{})-(?P<version>{})(-[0-9][^-]*?)?'
            r'-.+?-.+?-.+?\.whl$',
            (PROJECT_NAME_NODASH, VERSION_NODASH),
        ),
    )
]

#: Partial regexes for package filenames with ambiguous grammars.  Each one
#: matches everything after the project name (starting at the "-" that
#: precedes the version).  If a hint as to the expected project name is given,
#: it will be prepended to the regexes when trying to determine a match;
#: otherwise, a generic pattern that matches all project names will be
#: prepended.
BAD_PACKAGE_BASES = [
    (kind, re.compile(template.format(*fields)))
    for kind, template, fields in (
        # See <https://git.io/fAclc>:
        (
            'dumb',
            r'-(?P<version>{})\.{}{}$',
            (VERSION, PLAT_NAME, ARCHIVE_EXT),
        ),

        # See <https://git.io/fAclv>:
        (
            'msi',
            r'-(?P<version>{})\.{}(?:-{})?\.msi$',
            (VERSION, PLAT_NAME, PYVER),
        ),

        (
            'sdist',
            r'-(?P<version>{}){}$',
            (VERSION, ARCHIVE_EXT),
        ),

        # See <https://git.io/fAclL>:
        (
            'wininst',
            r'-(?P<version>{})\.{}(?:-{})?\.exe$',
            (VERSION, PLAT_NAME, PYVER),
        ),
    )
]

#: Regexes for package filenames with ambiguous grammars, using a generic
#: pattern that matches all project names.  Each entry is the corresponding
#: BAD_PACKAGE_BASES pattern with an anchored ``project`` group in front.
BAD_PACKAGE_RGXN = [
    (
        kind,
        re.compile(''.join(['^(?P<project>', PROJECT_NAME, ')', base.pattern])),
    )
    for kind, base in BAD_PACKAGE_BASES
]

def parse_filename(filename: str, project_hint: Optional[str] = None) \
        -> Union[Tuple[str, str, str], Tuple[None, None, None]]:
    """
    Given the filename of a distribution package, returns a triple of the
    project name, project version, and package type.  The name and version are
    spelled the same as they appear in the filename; no normalization is
    performed.

    The package type may be any of the following strings:

    - ``'dumb'``
    - ``'egg'``
    - ``'msi'``
    - ``'rpm'``
    - ``'sdist'``
    - ``'wheel'``
    - ``'wininst'``

    If the filename cannot be parsed, ``(None, None, None)`` is returned.

    Note that some filenames (e.g., :file:`1-2-3.tar.gz`) may be ambiguous as
    to which part is the project name and which is the version.  In order to
    resolve the ambiguity, the expected value for the project name (*modulo*
    normalization) can be supplied as the ``project_hint`` argument to the
    function.  If the filename can be parsed with the given string in the role
    of the project name, the results of that parse will be returned; otherwise,
    the function will fall back to breaking the project & version apart at an
    unspecified point.

    When matching the hint against the filename, ASCII letters are compared
    case-insensitively and every run of non-alphanumeric characters in the
    hint matches any run of ``-``, ``_``, and ``.`` characters.  A hint that
    contains no ASCII letters or digits at all (e.g., the empty string) is
    ignored, as it could only ever "match" an empty or separator-only project
    name.

    :param str filename: The package filename to parse
    :param Optional[str] project_hint:
        Optionally, the expected value for the project name (usually the name
        of the project page on which the filename was found).  The name does
        not need to be normalized.
    :rtype: Union[Tuple[str, str, str], Tuple[None, None, None]]
    """
    # Formats whose grammar is unambiguous are tried first; the hint is not
    # needed (or consulted) for them.
    for pkg_type, rgx in GOOD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group('project'), m.group('version'), pkg_type)

    # Only use the hint if it contains at least one ASCII alphanumeric;
    # otherwise the regex built from it would be empty or consist solely of
    # "[-_.]+" and would report a bogus project name such as "" or "-".
    if project_hint is not None and re.search(r'[A-Za-z0-9]', project_hint):
        # Turn the hint into a regex: separator runs become "[-_.]+" ...
        proj_rgx = re.sub(r'[^A-Za-z0-9]+', '[-_.]+', project_hint)
        # ... and each letter becomes a two-character class such as "[Ff]".
        # (Spelled out by hand rather than via re.IGNORECASE so that only the
        # exact ASCII upper/lower pair is accepted.)
        proj_rgx = re.sub(
            r'([A-Za-z])',
            lambda letter: (
                '[' + letter.group(1).upper() + letter.group(1).lower() + ']'
            ),
            proj_rgx,
        )
        # The project name must be immediately followed by the "-" that
        # introduces the version; the lookahead leaves that "-" unconsumed so
        # that the BAD_PACKAGE_BASES patterns can match it.
        m = re.match(proj_rgx + r'(?=-)', filename)
        if m:
            project = m.group(0)
            rest_of_name = filename[m.end(0):]
            for pkg_type, rgx in BAD_PACKAGE_BASES:
                m = rgx.match(rest_of_name)
                if m:
                    return (project, m.group('version'), pkg_type)

    # No usable hint, or the hint did not fit the filename: fall back to the
    # generic project-name pattern.
    for pkg_type, rgx in BAD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group('project'), m.group('version'), pkg_type)
    return (None, None, None)