# Source code for cardinal_pythonlib.source_reformatting

#!/usr/bin/env python

"""
tools/reformat_source.py

===============================================================================

    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of cardinal_pythonlib.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        https://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.

===============================================================================

**Clean up source code.**

"""

import logging
from os import walk
from os.path import join, splitext
from sys import stdout
from typing import List, Optional, TextIO

from cardinal_pythonlib.fileops import relative_filename_within_dir
from cardinal_pythonlib.logs import BraceStyleAdapter

# Module-level logger, wrapped in BraceStyleAdapter; calls in this file pass
# "{}"-style placeholders with separate arguments.
log = BraceStyleAdapter(logging.getLogger(__name__))

# Line of equals signs; in a source docstring, a pair of these lines brackets
# the copyright block that gets replaced.
TRANSITION = "==============================================================================="  # noqa
# The shebang written as the first line of every processed file.
CORRECT_SHEBANG = "#!/usr/bin/env python"
# A docstring line consisting of exactly this text is removed.
RST_COMMENT_LINE = ".."
# Prefix that the first line of a file must have for it to be processed.
SHEBANG_START = "#!"
# Recognized docstring openers (plain and raw).
TRIPLE_DOUBLEQUOTE = '"""'
RAW_TRIPLE_DOUBLEQUOTE = 'r"""'
BLANK = ""
# Placeholder title inserted when a file had no docstring at all.
MISSING_RST_TITLE = "**Missing title.**"

# Single characters used when reading, checking and writing lines.
CR = "\r"
LF = "\n"
NL = LF  # the newline stripped from source lines and appended on output
SPACE = " "
TAB = "\t"
HASH = "#"
HASH_SPACE = "# "
# Only files with this extension are processed.
PYTHON_EXTENSION = ".py"


# =============================================================================
# PythonProcessor
# =============================================================================


class PythonProcessor(object):
    """
    Class to read a Python source file and reformat its shebang/docstring etc.

    Constructing an instance reads the file and builds the proposed
    replacement in memory (``dest_lines``). Nothing is written until
    :meth:`show` (to stdout) or :meth:`rewrite_file` (back to the source file)
    is called.
    """

    def __init__(
        self, full_path: str, top_dir: str, correct_copyright_lines: List[str]
    ) -> None:
        """

        Args:
            full_path:
                full path to source file
            top_dir:
                directory from which we calculate a relative filename to be
                shown
            correct_copyright_lines:
                list of lines (without newlines) representing the copyright
                docstring block, including the transition lines of equals
                symbols
        """
        self.full_path = full_path
        self.advertised_filename = relative_filename_within_dir(
            full_path, top_dir
        )
        self.correct_copyright_lines = correct_copyright_lines
        self.needs_rewriting = False
        self.source_lines = []  # type: List[str]
        self.dest_lines = []  # type: List[str]
        self._read_source()
        self._create_dest()

    def _read_source(self) -> None:
        """
        Reads the source file into ``self.source_lines`` (one entry per line,
        without line endings), warning about tabs and carriage returns.
        """
        # newline="" keeps line endings untranslated. In the default
        # universal-newline mode, "\r" and "\r\n" are converted to "\n"
        # before we ever see them, so the carriage-return warning below could
        # never fire.
        with open(self.full_path, "rt", encoding="utf-8", newline="") as f:
            for linenum, raw_line in enumerate(f, start=1):
                line = raw_line[:-1] if raw_line.endswith(NL) else raw_line
                if TAB in line:
                    self._warn(f"Tab character at line {linenum}")
                if CR in line:
                    self._warn(
                        f"Carriage return character at line {linenum} "
                        f"(Windows CR+LF endings?)"
                    )
                    # In this mode a CR can only be (part of) the line
                    # ending; drop it so that the stored line is the same as
                    # a translated read would have produced.
                    line = line.rstrip(CR)
                self.source_lines.append(line)

    def _create_dest(self) -> None:
        """
        Creates an internal representation of the destination file.

        This is where the thinking happens: a line-by-line state machine over
        ``self.source_lines`` that

        - normalizes the shebang on line 1;
        - drops a ``# something.py`` filename comment on line 2;
        - rewrites the start of the module docstring as: opening quotes,
          advertised filename, blank, the correct copyright block, blank;
        - discards the old copyright block (between two transition lines),
          RST comment lines, and leading blank/filename lines within the
          source docstring;
        - inserts a whole new docstring after line 1 if the file had none;
        - strips trailing whitespace from lines ending in a space.

        Sets ``self.needs_rewriting`` if the result differs from the source.
        If the file looks unsafe to process, warns and returns early via
        :meth:`_too_risky`.
        """
        in_body = False
        in_docstring = False
        in_copyright = False
        copyright_done = False
        docstring_done = False
        swallow_blanks_and_filename_in_docstring = False
        for linenum, sl in enumerate(self.source_lines, start=1):
            # dl is the destination line; it is set to None when the source
            # line should be dropped.
            dl = sl

            if dl.endswith(SPACE):
                self._debug(f"Line {linenum} ends in whitespace")
                dl = dl.rstrip()

            if not in_body:

                if linenum == 1:
                    # Shebang
                    if not dl.startswith(SHEBANG_START):
                        self._warn(
                            f"File does not start with shebang; "
                            f"first line was {dl!r}"
                        )
                        self._too_risky()
                        return
                    if dl != CORRECT_SHEBANG:
                        self._debug(f"Rewriting shebang; was {dl!r}")
                    dl = CORRECT_SHEBANG

                if (
                    linenum == 2
                    and dl.startswith(HASH_SPACE)
                    and dl.endswith(PYTHON_EXTENSION)
                ):
                    self._debug(f"Removing filename comment: {dl!r}")
                    dl = None

                elif TRIPLE_DOUBLEQUOTE in dl:
                    if not dl.startswith(
                        TRIPLE_DOUBLEQUOTE
                    ) and not dl.startswith(RAW_TRIPLE_DOUBLEQUOTE):
                        self._warn(
                            "Triple-quote not at start of line, as follows"
                        )
                        self._debug_line(linenum, dl)
                        self._too_risky()
                        return
                    if in_docstring:  # docstring finishing
                        in_docstring = False
                        docstring_done = True
                        in_body = True
                        # ... and keep dl, so we write the end of the
                        # docstring, potentially with e.g. "# noqa" on the end
                    elif not docstring_done:  # docstring starting
                        in_docstring = True
                        # Write our new docstring's start
                        if dl.startswith(TRIPLE_DOUBLEQUOTE):
                            tdq = TRIPLE_DOUBLEQUOTE
                        elif dl.startswith(RAW_TRIPLE_DOUBLEQUOTE):
                            tdq = RAW_TRIPLE_DOUBLEQUOTE
                        else:
                            # Unreachable given the startswith() check above.
                            # A bare ``assert "Bug!"`` would always pass
                            # (non-empty string), so raise explicitly.
                            raise AssertionError("Bug!")
                        self.dest_lines.append(tdq)
                        self.dest_lines.append(self.advertised_filename)
                        self.dest_lines.append(BLANK)
                        self.dest_lines.extend(self.correct_copyright_lines)
                        self.dest_lines.append(BLANK)
                        swallow_blanks_and_filename_in_docstring = True
                        if dl == tdq:
                            dl = None  # don't write another triple-quote line
                        else:
                            dl = dl[len(tdq) :]
                            if TRIPLE_DOUBLEQUOTE in dl:
                                # One-line docstring: it opens and closes on
                                # this same line. Without this, the rest of
                                # the file would be treated as docstring
                                # content (and have its blank lines
                                # swallowed).
                                in_docstring = False
                                docstring_done = True
                                in_body = True

                elif in_docstring:
                    # Reading within the source docstring

                    if dl == TRANSITION:
                        if in_copyright:  # copyright finishing
                            in_copyright = False
                            copyright_done = True
                            dl = None  # we've already replaced with our own
                        elif not copyright_done:
                            in_copyright = True
                            dl = None  # we've already replaced with our own

                    elif in_copyright:
                        dl = None  # we've already replaced with our own

                    elif dl == RST_COMMENT_LINE:
                        dl = None  # remove these

                    elif swallow_blanks_and_filename_in_docstring:
                        if dl == BLANK or dl == self.advertised_filename:
                            dl = None
                        elif copyright_done:
                            # First real content after the old copyright
                            # block: stop swallowing.
                            swallow_blanks_and_filename_in_docstring = False

                elif not dl.startswith(HASH) and not dl == BLANK:
                    # First line that is neither comment nor blank: the body
                    # of the file starts here.
                    in_body = True

                    if not docstring_done:
                        # The source file didn't have a docstring!
                        new_docstring_lines = (
                            [
                                BLANK,
                                TRIPLE_DOUBLEQUOTE,
                                self.advertised_filename,
                                BLANK,
                            ]
                            + self.correct_copyright_lines
                            + [
                                BLANK,
                                MISSING_RST_TITLE,
                                BLANK,
                                TRIPLE_DOUBLEQUOTE,
                            ]
                        )
                        self._warn(
                            f"File had no docstring; adding one. "
                            f"Will need manual edit to add RST title. "
                            f"Search for {MISSING_RST_TITLE!r}"
                        )
                        # Insert directly after the shebang line.
                        self.dest_lines[1:1] = new_docstring_lines

            if dl is not None:
                self.dest_lines.append(dl)

        self.needs_rewriting = self.dest_lines != self.source_lines

    @staticmethod
    def _debug_line(linenum: int, line: str, extramsg: str = "") -> None:
        """
        Writes a debugging report on a line.

        Note that this is emitted at CRITICAL level (not DEBUG).

        Args:
            linenum: line number to report
            line: line contents (shown via ``repr``)
            extramsg: optional prefix for the message
        """
        log.critical("{}Line {}: {!r}", extramsg, linenum, line)

    def _logmsg(self, msg: str) -> str:
        """
        Formats a log message by prefixing it with the advertised filename.
        """
        return f"{self.advertised_filename}: {msg}"

    def _critical(self, msg: str) -> None:
        """
        Shows a critical message.
        """
        log.critical(self._logmsg(msg))

    def _warn(self, msg: str) -> None:
        """
        Shows a warning.
        """
        log.warning(self._logmsg(msg))

    def _info(self, msg: str) -> None:
        """
        Shows an info message.
        """
        log.info(self._logmsg(msg))

    def _debug(self, msg: str) -> None:
        """
        Shows a debugging message.
        """
        log.debug(self._logmsg(msg))

    def _too_risky(self) -> None:
        """
        Shows a warning and sets this file as not for processing.

        Only ``needs_rewriting`` is reset; ``dest_lines`` is left as built so
        far, so it may be empty or partial.
        """
        self._warn("Don't know how to process file")
        self.needs_rewriting = False

    def show(self) -> None:
        """
        Writes the destination to stdout.
        """
        self._write(stdout)

    def rewrite_file(self) -> None:
        """
        Rewrites the source file, if (and only if) it needs rewriting.
        """
        if not self.needs_rewriting:
            return
        self._info("Rewriting file")
        # Same explicit encoding as used for reading, so the round trip does
        # not depend on the locale.
        with open(self.full_path, "w", encoding="utf-8") as outfile:
            self._write(outfile)

    def _write(self, destination: TextIO) -> None:
        """
        Writes the converted output to a destination, one line at a time,
        each followed by a newline.
        """
        for line in self.dest_lines:
            destination.write(line + NL)


# =============================================================================
# Top-level functions
# =============================================================================


def reformat_python_docstrings(
    top_dirs: List[str],
    correct_copyright_lines: List[str],
    show_only: bool = True,
    rewrite: bool = False,
    process_only_filenum: Optional[int] = None,
) -> None:
    """
    Walk a directory, finding Python files and rewriting them.

    Every file with a ``.py`` extension under each top-level directory is
    numbered (1-based, in the order encountered) and handed to
    :class:`PythonProcessor`.

    Args:
        top_dirs: list of directories to descend into
        correct_copyright_lines:
            list of lines (without newlines) representing the copyright
            docstring block, including the transition lines of equals
            symbols
        show_only: show results (to stdout) only; don't rewrite. Takes
            precedence over ``rewrite``.
        rewrite: write the changes (only consulted if ``show_only`` is false)
        process_only_filenum: only process this file number (1-based index);
            for debugging only. ``None`` (or 0) means process every file.
    """
    filenum = 0
    for top_dir in top_dirs:
        for dirpath, _dirnames, filenames in walk(top_dir):
            for filename in filenames:
                fullname = join(dirpath, filename)
                extension = splitext(filename)[1]
                if extension != PYTHON_EXTENSION:
                    continue

                # Count every Python file, including skipped ones, so that
                # file numbers stay stable when filtering.
                filenum += 1

                if process_only_filenum and filenum != process_only_filenum:
                    continue

                log.info("Processing file {}: {}", filenum, fullname)
                proc = PythonProcessor(
                    full_path=fullname,
                    top_dir=top_dir,
                    correct_copyright_lines=correct_copyright_lines,
                )
                if show_only:
                    proc.show()
                elif rewrite:
                    proc.rewrite_file()