Source code for cardinal_pythonlib.network

#!/usr/bin/env python
# cardinal_pythonlib/network.py

"""
===============================================================================

    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of cardinal_pythonlib.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        https://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.

===============================================================================

**Network support functions.**

NOTES:

- ``ping`` requires root authority to create ICMP sockets in Linux
- the ``/bin/ping`` command doesn't need prior root authority (because it has
  the setuid bit set)
- For Linux, it's therefore best to use the system ``ping``.

https://stackoverflow.com/questions/2953462/pinging-servers-in-python
https://stackoverflow.com/questions/316866/ping-a-site-in-python

- Note that if you want a sub-second timeout, things get trickier.
  One option is ``fping``.

"""

import os
import ssl
import subprocess
import sys
import tempfile
from typing import BinaryIO, Dict, Generator, Iterable
import urllib.request

from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler

log = get_brace_style_log_with_null_handler(__name__)


# =============================================================================
# Ping
# =============================================================================


[docs]def ping(hostname: str, timeout_s: int = 5) -> bool:
    """
    Pings a host, using OS tools.

    Args:
        hostname: host name or IP address
        timeout_s: timeout in seconds

    Returns:
        was the ping successful?

    """
    if sys.platform == "win32":
        timeout_ms = timeout_s * 1000
        args = [
            "ping",
            hostname,
            "-n",
            "1",  # ping count
            "-w",
            str(timeout_ms),  # timeout
        ]
    elif sys.platform.startswith("linux"):
        args = [
            "ping",
            hostname,
            "-c",
            "1",  # ping count
            "-w",
            str(timeout_s),  # timeout
        ]
    else:
        raise AssertionError("Don't know how to ping on this operating system")
    proc = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    proc.communicate()
    retcode = proc.returncode
    return retcode == 0  # zero success, non-zero failure


# =============================================================================
# Download things
# =============================================================================


[docs]def download(
    url: str,
    filename: str,
    skip_cert_verify: bool = True,
    headers: Dict[str, str] = None,
) -> None:
    """
    Downloads a URL to a file.

    Args:
        url:
            URL to download from
        filename:
            file to save to
        skip_cert_verify:
            skip SSL certificate check?
        headers:
            request headers (if not specified, a default will be used that
            mimics Mozilla 5.0 to avoid certain HTTP 403 errors)
    """
    headers = {"User-Agent": "Mozilla/5.0"} if headers is None else headers
    log.info("Downloading from {} to {}", url, filename)

    # urllib.request.urlretrieve(url, filename)
    # ... sometimes fails (e.g. downloading
    # https://www.openssl.org/source/openssl-1.1.0g.tar.gz under Windows) with:
    # ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)  # noqa
    # ... due to this certificate root problem (probably because OpenSSL
    #     [used by Python] doesn't play entirely by the same rules as others?):
    # https://stackoverflow.com/questions/27804710
    # So:

    # Patching this by faking a browser request by adding User-Agent to request
    # headers, using this as example:
    # https://stackoverflow.com/questions/42863240/how-to-get-round-the-http-error-403-forbidden-with-urllib-request-using-python  # noqa

    ctx = ssl.create_default_context()  # type: ssl.SSLContext
    if skip_cert_verify:
        log.debug("Skipping SSL certificate check for " + url)
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    page = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(page, context=ctx) as u, open(
        filename, "wb"
    ) as f:
        f.write(u.read())


# =============================================================================
# Generators
# =============================================================================


[docs]def gen_binary_files_from_urls(
    urls: Iterable[str], on_disk: bool = False, show_info: bool = True
) -> Generator[BinaryIO, None, None]:
    """
    Generate binary files from a series of URLs (one per URL).

    Args:
        urls: iterable of URLs
        on_disk: if ``True``, yields files that are on disk (permitting
            random access); if ``False``, yields in-memory files (which will
            not permit random access)
        show_info: show progress to the log?

    Yields:
        files, each of type :class:`BinaryIO`

    """
    for url in urls:
        if on_disk:
            # Necessary for e.g. zip processing (random access)
            with tempfile.TemporaryDirectory() as tmpdir:
                filename = os.path.join(tmpdir, "tempfile")
                download(url=url, filename=filename)
                with open(filename, "rb") as f:
                    yield f
        else:
            if show_info:
                log.info("Reading from URL: {}", url)
            with urllib.request.urlopen(url) as f:
                yield f
        if show_info:
            log.info("... finished reading from URL: {}", url)