Source code for pypi_simple.parse_repo

from   typing       import Dict, List, Optional, Tuple, Union
from   urllib.parse import urljoin
from   bs4          import BeautifulSoup
import requests
from   .classes     import DistributionPackage, IndexPage, Link, ProjectPage
from   .util        import check_repo_version

def parse_repo_links(
    html: Union[str, bytes],
    base_url: Optional[str] = None,
    from_encoding: Optional[str] = None,
) -> Tuple[Dict[str, str], List[Link]]:
    """
    .. versionadded:: 0.7.0

    Extract the repository metadata and the hyperlinks from an HTML page
    served by a simple repository, returning them as a ``(metadata, links)``
    pair.

    ``metadata`` is a ``Dict[str, str]``.  At present the only key it can
    contain is ``"repository_version"``, holding the repository version that
    the page declares per :pep:`629`; when the page declares no version, the
    key is omitted from the `dict`.

    ``links`` is a list of `Link` objects, one for every ``<a>`` tag in the
    page that has an ``href`` attribute.

    If the page contains a ``<base>`` tag with an ``href`` attribute, that
    value is joined onto ``base_url`` (or used in its place when ``base_url``
    is `None`) before the links' URLs are resolved.

    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] base_url: an optional URL against which the links'
        URLs are resolved (usually the URL of the page being parsed)
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :rtype: Tuple[Dict[str, str], List[Link]]
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    document = BeautifulSoup(html, 'html.parser', from_encoding=from_encoding)

    # A <base href="..."> tag refines (or, absent a caller-supplied URL,
    # provides) the URL that the page's hrefs are resolved against.
    base_element = document.find('base', href=True)
    if base_element is not None:
        declared_base = base_element['href']
        base_url = (
            declared_base if base_url is None
            else urljoin(base_url, declared_base)
        )

    def resolve(href: str) -> str:
        # With nothing to resolve against, hrefs are passed through untouched.
        if base_url is None:
            return href
        assert isinstance(base_url, str)
        return urljoin(base_url, href)

    metadata: Dict[str, str] = {}
    version_element = document.find(
        'meta',
        attrs={"name": "pypi:repository-version", "content": True},
    )
    if version_element is not None:
        declared_version = version_element["content"]
        metadata["repository_version"] = declared_version
        # Reject the page before collecting links if its version is
        # unsupported.
        check_repo_version(declared_version)

    links = [
        Link(
            text=''.join(anchor.strings).strip(),
            url=resolve(anchor['href']),
            attrs=anchor.attrs,
        )
        for anchor in document.find_all('a', href=True)
    ]
    return (metadata, links)

def parse_repo_project_page(
    project: str,
    html: Union[str, bytes],
    base_url: Optional[str] = None,
    from_encoding: Optional[str] = None,
) -> ProjectPage:
    """
    .. versionadded:: 0.7.0

    Build a `ProjectPage` from the HTML of a simple repository's project
    page.  Every hyperlink found by `parse_repo_links()` is converted to a
    `DistributionPackage`.  The resulting page's `~ProjectPage.last_serial`
    attribute is always `None`.

    :param str project: The name of the project whose page is being parsed
    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] base_url: an optional URL against which the
        packages' URLs are resolved (usually the URL of the page being parsed)
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :rtype: ProjectPage
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    metadata, links = parse_repo_links(
        html,
        base_url=base_url,
        from_encoding=from_encoding,
    )
    packages = [DistributionPackage.from_link(lnk, project) for lnk in links]
    return ProjectPage(
        project=project,
        packages=packages,
        repository_version=metadata.get("repository_version"),
        last_serial=None,
    )

def parse_repo_project_response(
    project: str,
    r: requests.Response,
) -> ProjectPage:
    """
    .. versionadded:: 0.7.0

    Build a `ProjectPage` from the `requests.Response` to a (non-streaming)
    request for a project page of a simple repository.  The page's
    `~ProjectPage.last_serial` attribute is taken from the response's
    :mailheader:`X-PyPI-Last-Serial` header and is `None` if that header is
    absent.

    :param str project: The name of the project whose page is being parsed
    :param requests.Response r: the response object to parse
    :rtype: ProjectPage
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    content_type = r.headers.get('content-type', '')
    # Offer an encoding hint only when the Content-Type header mentions a
    # charset; otherwise no hint is passed along.
    charset: Optional[str] = (
        r.encoding if 'charset' in content_type.lower() else None
    )
    parsed = parse_repo_project_page(project, r.content, r.url, charset)
    serial = r.headers.get("X-PyPI-Last-Serial")
    return parsed._replace(last_serial=serial)

def parse_repo_index_page(
    html: Union[str, bytes],
    from_encoding: Optional[str] = None,
) -> IndexPage:
    """
    .. versionadded:: 0.7.0

    Build an `IndexPage` from the HTML of a simple repository's index/root
    page.  The project names are the texts of the hyperlinks found by
    `parse_repo_links()`.  The resulting page's `~IndexPage.last_serial`
    attribute is always `None`.

    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :rtype: IndexPage
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    metadata, links = parse_repo_links(html, from_encoding=from_encoding)
    project_names = [lnk.text for lnk in links]
    return IndexPage(
        projects=project_names,
        repository_version=metadata.get("repository_version"),
        last_serial=None,
    )

def parse_repo_index_response(r: requests.Response) -> IndexPage:
    """
    .. versionadded:: 0.7.0

    Build an `IndexPage` from the `requests.Response` to a (non-streaming)
    request for the index page of a simple repository.  The page's
    `~IndexPage.last_serial` attribute is taken from the response's
    :mailheader:`X-PyPI-Last-Serial` header and is `None` if that header is
    absent.

    :param requests.Response r: the response object to parse
    :rtype: IndexPage
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    content_type = r.headers.get('content-type', '')
    # Offer an encoding hint only when the Content-Type header mentions a
    # charset; otherwise no hint is passed along.
    charset: Optional[str] = (
        r.encoding if 'charset' in content_type.lower() else None
    )
    parsed = parse_repo_index_page(html=r.content, from_encoding=charset)
    serial = r.headers.get("X-PyPI-Last-Serial")
    return parsed._replace(last_serial=serial)