Source code for pypi_simple.filenames

from __future__ import annotations
import re
from typing import Optional
from .errors import UnparsableFilenameError

# Regex fragments (plain strings, not compiled) that are spliced into the
# filename patterns defined below.

# Project name: begins and ends with an ASCII alphanumeric; interior characters
# may additionally be ".", "_", or "-".
PROJECT_NAME = r"[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?"
# Same as PROJECT_NAME, except that "-" is not permitted in the interior.
PROJECT_NAME_NODASH = r"[A-Za-z0-9](?:[A-Za-z0-9._]*[A-Za-z0-9])?"
# Version: one or more ASCII alphanumerics or "_", ".", "!", "+", "-".  The
# quantifier is lazy, so the version claims as little of the string as the
# rest of the enclosing pattern allows.
VERSION = r"[A-Za-z0-9_.!+-]+?"
# Same as VERSION, except that "-" is not permitted.
VERSION_NODASH = r"[A-Za-z0-9_.!+]+?"
# Archive suffix: ".tar" (bare or followed by a compression suffix), one of the
# short forms ".tbz"/".tgz"/".tlz"/".txz", or ".zip".
ARCHIVE_EXT = r"\.(?:tar|tar\.(?:bz2|gz|lz|lzma|xz|Z)|tbz|tgz|tlz|txz|zip)"
# Platform name: one of a fixed set of OS-name prefixes followed by any run of
# ASCII alphanumerics, "-", ".", or "_".
PLAT_NAME = r"(?:aix|cygwin|darwin|linux|macosx|solaris|sunos|[wW]in)[-.A-Za-z0-9_]*"
# "py" followed by two dot-separated runs of digits (e.g., "py3.7").
PYVER = r"py[0-9]+\.[0-9]+"

#: Compiled regexes, each paired with its package type, for the filename
#: formats whose project name & version can be told apart unambiguously
GOOD_PACKAGE_RGXN = [
    # Each spec is (package type, pattern template, fragments substituted for
    # the template's ``{}`` placeholders, in order).
    (kind, re.compile(template.format(*fields)))
    for kind, template, fields in (
        # See <https://setuptools.readthedocs.io/en/latest
        #      /formats.html#filename-embedded-metadata>:
        (
            "egg",
            r"^(?P<project>{})-(?P<version>{})(?:-{}(?:-{})?)?\.egg$",
            (PROJECT_NAME_NODASH, VERSION_NODASH, PYVER, PLAT_NAME),
        ),
        # See <http://ftp.rpm.org/max-rpm/ch-rpm-file-format.html>:
        # (The architecture pattern is mainly just a guess based on what's
        # currently on PyPI.)
        (
            "rpm",
            r"^(?P<project>{})-(?P<version>{})-[^-]+\.[A-Za-z0-9._]+\.rpm$",
            (PROJECT_NAME, VERSION_NODASH),
        ),
        # Regex adapted from <https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569>:
        (
            "wheel",
            r"^(?P<project>{})-(?P<version>{})(-[0-9][^-]*?)?"
            r"-.+?-.+?-.+?\.whl$",
            (PROJECT_NAME_NODASH, VERSION_NODASH),
        ),
    )
]

#: Partial (project-name-less) regexes for the filename formats whose grammar
#: is ambiguous.  Each one starts at the hyphen that follows the project name.
#: When a project name hint is available, a pattern built from the hint is
#: matched in front of these; otherwise, a generic pattern that matches all
#: project names is prepended.
BAD_PACKAGE_BASES = [
    # Each spec is (package type, pattern template, fragments substituted for
    # the template's ``{}`` placeholders, in order).
    (kind, re.compile(template.format(*fields)))
    for kind, template, fields in (
        # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_dumb.py#L93>:
        (
            "dumb",
            r"-(?P<version>{})\.{}{}$",
            (VERSION, PLAT_NAME, ARCHIVE_EXT),
        ),
        # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_msi.py#L733>:
        (
            "msi",
            r"-(?P<version>{})\.{}(?:-{})?\.msi$",
            (VERSION, PLAT_NAME, PYVER),
        ),
        (
            "sdist",
            r"-(?P<version>{}){}$",
            (VERSION, ARCHIVE_EXT),
        ),
        # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292>:
        (
            "wininst",
            r"-(?P<version>{})\.{}(?:-{})?\.exe$",
            (VERSION, PLAT_NAME, PYVER),
        ),
    )
]

#: Complete regexes for the ambiguous filename grammars: every partial pattern
#: from `BAD_PACKAGE_BASES` anchored behind a generic ``project`` group that
#: matches all project names
BAD_PACKAGE_RGXN = [
    (
        kind,
        re.compile("".join(("^(?P<project>", PROJECT_NAME, ")", partial.pattern))),
    )
    for kind, partial in BAD_PACKAGE_BASES
]


def parse_filename(
    filename: str, project_hint: Optional[str] = None
) -> tuple[str, str, str]:
    """
    Given the filename of a distribution package, returns a triple of the
    project name, project version, and package type.  The name and version are
    spelled the same as they appear in the filename; no normalization is
    performed.

    The package type will be one of the following strings:

    - ``"dumb"``
    - ``"egg"``
    - ``"msi"``
    - ``"rpm"``
    - ``"sdist"``
    - ``"wheel"``
    - ``"wininst"``

    Note that some filenames (e.g., :file:`1-2-3.tar.gz`) may be ambiguous as
    to which part is the project name and which is the version.  In order to
    resolve the ambiguity, the expected value for the project name can be
    supplied as the ``project_hint`` argument to the function; it need not be
    normalized.  If the filename can be parsed with the given string in the
    role of the project name, the results of that parse will be returned;
    otherwise, the function will fall back to breaking the project & version
    apart at an unspecified point.

    .. versionchanged:: 1.0.0

        Now raises `UnparsableFilenameError` for unparsable filenames instead
        of returning all `None`\\s

    :param str filename: The package filename to parse
    :param Optional[str] project_hint:
        Optionally, the expected value for the project name (usually the name
        of the project page on which the filename was found).  The name does
        not need to be normalized.
    :rtype: tuple[str, str, str]
    :raises UnparsableFilenameError: if the filename cannot be parsed
    """
    # 1. Formats with an unambiguous grammar are tried first; the hint is not
    #    needed for them.
    for pkg_type, rgx in GOOD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group("project"), m.group("version"), pkg_type)

    # 2. For the ambiguous formats, first try to split the filename where the
    #    hinted project name ends.
    if project_hint is not None:
        # Turn the hint into a pattern that tolerates un-normalized spellings:
        # every run of non-alphanumerics matches any run of "-", "_", "." ...
        proj_rgx = re.sub(r"[^A-Za-z0-9]+", "[-_.]+", project_hint)
        # ... and every letter matches in either case.  (The replacement text
        # inserted above contains no letters, so it is left untouched here.)
        proj_rgx = re.sub(
            r"([A-Za-z])",
            lambda letter: (
                "[" + letter.group(1).upper() + letter.group(1).lower() + "]"
            ),
            proj_rgx,
        )
        # The lookahead requires the project name to be followed by the hyphen
        # that every partial pattern in BAD_PACKAGE_BASES begins with, without
        # consuming it.
        m = re.match(proj_rgx + r"(?=-)", filename)
        if m:
            project = m.group(0)
            rest_of_name = filename[m.end(0) :]
            for pkg_type, rgx in BAD_PACKAGE_BASES:
                m = rgx.match(rest_of_name)
                if m:
                    return (project, m.group("version"), pkg_type)

    # 3. No hint was given, or the filename could not be parsed with it: fall
    #    back to the generic project-name pattern.
    for pkg_type, rgx in BAD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group("project"), m.group("version"), pkg_type)

    raise UnparsableFilenameError(filename)