15.1.947. tablet_qt/tools/clang_format_camcops.py

#!/usr/bin/env python

"""
tablet_qt/tools/clang_format_camcops.py

===============================================================================

    Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CamCOPS.

    CamCOPS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CamCOPS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

Run clang-format over all our C++ code.
The formatting is controlled by clang_format_camcops.yaml.

"""

# =============================================================================
# Imports
# =============================================================================

import argparse
from enum import Enum
import filecmp
import glob
import logging
import os
import shutil
from subprocess import Popen, PIPE
import sys
import tempfile
from typing import List, Set, Tuple

from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
from rich_argparse import RichHelpFormatter
from pygments.lexer import RegexLexer
from pygments.token import Comment, String, Text, Whitespace
from semantic_version import Version

from camcops_server.cc_modules.cc_baseconstants import (
    EXIT_SUCCESS,
    EXIT_FAILURE,
)

log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Major version of clang-format that this script insists on; the version
# reported by "<executable> --version" is compared against it.
CLANG_FORMAT_VERSION = 15
# Executable name looked up on the PATH when --clangformat is not given.
CLANG_FORMAT_EXECUTABLE = f"clang-format-{CLANG_FORMAT_VERSION}"
DEFAULT_MAX_LINE_LENGTH = 79  # should match clang_format_camcops.yaml
# Diff program looked up on the PATH when --difftool is not given.
DIFFTOOL = "meld"
# Encoding used to decode the stdout/stderr of external commands.
ENC = sys.getdefaultencoding()
# Substrings that mark a line as containing a URL (see --ignore_urls).
URL_INDICATORS = ("http://", "https://")


class Command(Enum):
    """
    Commands that can be given to this program. See "--help" for details.

    The member values are the strings accepted as the positional "command"
    argument on the command line.
    """

    # Ensure nothing needs modifying; affects the exit code only.
    CHECK = "check"
    # Launch a diff tool comparing the original with the reformatted file.
    DIFF = "diff"
    # Show over-length lines that include, or are part of, a C++ comment.
    FINDLONGCOMMENTS = "findlongcomments"
    # Reformat files in place.
    MODIFY = "modify"
    # List the files that would be processed; do nothing else.
    LIST = "list"
    # Print the reformatted output to stdout.
    PRINT = "print"


# =============================================================================
# Directories
# =============================================================================

# Directory containing this script (with symbolic links resolved).
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
# The clang-format style file lives alongside this script.
CLANG_FORMAT_BASE_FILENAME = "clang_format_camcops.yaml"
CLANG_FORMAT_STYLE_FILE = os.path.abspath(
    os.path.join(THIS_DIR, CLANG_FORMAT_BASE_FILENAME)
)
# Parent of THIS_DIR; the globs below search beneath it for C++ code.
CAMCOPS_CPP_DIR = os.path.abspath(os.path.join(THIS_DIR, os.pardir))

# Glob patterns (expanded with recursive=True) for files to process when no
# files are named on the command line.
INCLUDE_GLOBS = [
    f"{CAMCOPS_CPP_DIR}/**/*.cpp",
    f"{CAMCOPS_CPP_DIR}/**/*.h",
]
# Glob patterns (expanded with recursive=True) whose matches are removed
# from the set found via INCLUDE_GLOBS.
EXCLUDE_GLOBS = [
    # Anything beneath a top-level directory whose name starts with "build".
    f"{CAMCOPS_CPP_DIR}/build**/**/*.cpp",
    f"{CAMCOPS_CPP_DIR}/build**/**/*.h",
    # Code by Qt whose format we won't fiddle with too much.
    f"{CAMCOPS_CPP_DIR}/**/boxlayouthfw.*",
    f"{CAMCOPS_CPP_DIR}/**/qcustomplot.*",
    f"{CAMCOPS_CPP_DIR}/**/qtlayouthelpers.*",
    f"{CAMCOPS_CPP_DIR}/**/sqlcachedresult.*",
    f"{CAMCOPS_CPP_DIR}/**/sqlcipherdriver.*",
    f"{CAMCOPS_CPP_DIR}/**/sqlcipherhelpers.*",
    f"{CAMCOPS_CPP_DIR}/**/sqlcipherresult.*",
]


# =============================================================================
# Find and print long comments
# =============================================================================


class CppCommentLexer(RegexLexer):
    """
    Pygments lexer to find C++ comments. Based on
    https://pygments.org/docs/lexerdevelopment/, but modified slightly.

    It emits each line of a multiline comment as a separate token, and it
    tracks string and character literals so that comment-like sequences
    inside them (e.g. the "//" in "http://...") are not mistaken for
    comments.

    Known limitation: C++11 raw string literals, R"(...)", are not treated
    specially.
    """

    name = "C++ comment lexer"
    tokens = {
        "root": [
            # At the root level:
            # - We may as well remove plain newlines.
            (r"[\n]", Whitespace),
            # - Anything not including a forward slash, a double quote or a
            #   single quote is text.
            (r"[^/\"']+", Text),
            # - The sequence /* starts a multiline comment (state: "comment").
            (r"/\*", Comment.Multiline, "comment"),
            # - The sequence // makes the rest of the line a comment.
            #   We can capture its contents in one go.
            (r"//.*$", Comment.Singleline),
            # - A double quote enters a string literal.
            (r"\"", String, "string"),
            # - A character literal holding one (possibly escaped) character
            #   is consumed whole, so that '"' does not open a string and
            #   '/' cannot contribute to a comment marker.
            (r"'(?:\\.|[^'\\\n])'", String),
            # - Any other single quote (e.g. a digit separator) is text.
            (r"'", Text),
            # - A plain forward slash is still plain text.
            (r"/", Text),
        ],
        "comment": [
            # Within a multiline comment:
            # - Newlines are emitted separately, as whitespace...
            (r"[\n]", Whitespace),
            # - ... so that anything else without a star or a slash forms a
            #   comment token that never spans more than one line.
            #   (The Pygments example used r"[^*/]+".)
            (r"[^*/\n]+", Comment.Multiline),
            # - DISABLED: the Pygments example used the following, meaning that
            #   a further /* entered a "deeper" level of comment, but that is
            #   not C++ syntax.
            #   (r"/\*", Comment.Multiline, "#push"),
            # - The sequence */ ends a multiline comment.
            (r"\*/", Comment.Multiline, "#pop"),
            # - A star or a forward slash, otherwise, remains within a comment.
            (r"[*/]", Comment.Multiline),
        ],
        "string": [
            # Within a string literal:
            # - A backslash escapes the next character, whatever it is. That
            #   covers \" (which must not end the string), \\ (whose second
            #   backslash must not escape a following quote), and a
            #   backslash-newline line continuation.
            (r"\\(?:.|\n)", String),
            # - Otherwise a double quote ends the string.
            (r"\"", String, "#pop"),
            # - Anything else is part of the string. Backslashes must be
            #   excluded here as well as double quotes; otherwise this greedy
            #   rule swallows the backslash of an escape sequence and the
            #   escaped quote that follows is then taken as the end of the
            #   string.
            (r"[^\"\\]+", String),
            # - A lone backslash at the very end of the input.
            (r"\\", String),
        ],
    }


def report_line(
    filename: str, linenum: int, text: str, bare: bool = False
) -> None:
    """
    Write one offending line to stdout.

    Args:
        filename:
            Name of the file the line came from.
        linenum:
            Line number within that file.
        text:
            The text of the line.
        bare:
            If true, print just the text; otherwise prefix it with the
            filename, the line number, and the length of the text.
    """
    message = (
        text
        if bare
        else f"{filename}:{linenum}, length {len(text)}: {text}"
    )
    print(message)


def get_line_at_pos(contents: str, pos: int) -> Tuple[int, str]:
    """
    Takes a multi-line string, and an integer (zero-based) position. Returns
    a tuple of the line number and the line text, containing that position.

    Args:
        contents:
            The full text to search within.
        pos:
            Zero-based character offset into ``contents``.

    Returns:
        tuple: ``linenum, linetext``, where ``linenum`` is one-based and
        ``linetext`` excludes the line's terminating newline.
    """
    # The line starts just after the last newline before pos (or at the
    # start of the text, since rfind() returns -1 if there is none).
    line_start = contents.rfind("\n", 0, pos) + 1
    # The line ends at the next newline at or after pos. If there isn't one
    # (final line without a trailing newline), find() returns -1; using
    # that as a slice end would silently drop the last character, so take
    # everything up to the end of the text instead.
    line_end = contents.find("\n", pos)
    if line_end == -1:
        line_end = len(contents)
    linenum = contents.count("\n", 0, pos) + 1
    return linenum, contents[line_start:line_end]


def print_long_comments(
    filename: str,
    maxlinelength: int = DEFAULT_MAX_LINE_LENGTH,
    ignore_urls: bool = False,
    bare: bool = False,
    debugtokens: bool = False,
) -> int:
    """
    Report (to stdout) every line of a file that exceeds maxlinelength and
    contains, or is part of, a C++ comment. Each line is considered at most
    once, however many comment tokens it contains.

    Args:
        filename:
            File to read.
        maxlinelength:
            Maximum permissible line length.
        ignore_urls:
            Ignore any lines that contain a string from URL_INDICATORS.
        bare:
            Print offending lines bare.
        debugtokens:
            Report tokens, for debugging.

    Returns:
        The number of long lines reported.
    """
    log.debug(
        f"Searching for comment lines >{maxlinelength} characters: {filename}"
    )

    with open(filename) as f:
        contents = f.read()

    comment_types = (Comment.Multiline, Comment.Singleline)
    visited = set()  # type: Set[int]
    n_reported = 0
    tokens = CppCommentLexer().get_tokens_unprocessed(contents)
    for pos, tokentype, tokentext in tokens:
        if debugtokens:
            log.debug(f"{pos=}, {tokentype=}, {tokentext=}")
        if tokentype not in comment_types:
            continue  # not a comment
        linenum, linetext = get_line_at_pos(contents, pos)
        if linenum in visited:
            continue  # this line has already been dealt with
        visited.add(linenum)
        if len(linetext) <= maxlinelength:
            continue  # short enough
        if ignore_urls and any(u in linetext for u in URL_INDICATORS):
            continue  # long, but excused because it contains a URL
        report_line(filename, linenum, linetext, bare=bare)
        n_reported += 1

    return n_reported


# =============================================================================
# Apply clang-format to our source code
# =============================================================================


def runit(cmdargs: List[str]) -> Tuple[str, str, int]:
    """
    Execute an external program and wait for it to finish.

    Args:
        cmdargs:
            The program to run, followed by its arguments.

    Returns:
        tuple: ``stdout, stderr, returncode``, with both output streams
        decoded to text using ENC.
    """
    log.debug(cmdargs)
    process = Popen(cmdargs, stdout=PIPE, stderr=PIPE)
    raw_stdout, raw_stderr = process.communicate()
    return raw_stdout.decode(ENC), raw_stderr.decode(ENC), process.returncode


def clang_format_camcops_source() -> None:
    """
    Apply clang-format to CamCOPS C++ source code, to standardize code style.

    This is the command-line entry point. It:

    - parses the command-line arguments;
    - checks that a clang-format executable is available and that its major
      version is CLANG_FORMAT_VERSION;
    - works out which files to process (those named on the command line or,
      if none, everything matching INCLUDE_GLOBS but not EXCLUDE_GLOBS);
    - executes the requested Command for each file.

    The function does not return: it ends the process via ``sys.exit()``,
    with EXIT_FAILURE if a required tool is missing or of the wrong version,
    if the "check" command found a file that would be modified, or if the
    "findlongcomments" command found a long line; and with EXIT_SUCCESS
    otherwise.

    Raises:
        RuntimeError: if clang-format returns a non-zero exit code.
    """
    parser = argparse.ArgumentParser(formatter_class=RichHelpFormatter)
    parser.add_argument(
        "command",
        type=str,
        choices=[x.value for x in Command],
        help=f"Command to execute. "
        f"{Command.CHECK.value!r}: ensure nothing needs modifying; return "
        f"exit code {EXIT_SUCCESS} if everything is OK and {EXIT_FAILURE} if "
        f"something needs fixing. "
        f"{Command.DIFF.value!r}: launch a diff for the first file specified. "
        f"{Command.FINDLONGCOMMENTS.value!r}: show lines that include or are "
        f"part of a C++ comment and are longer than the permitted length. "
        f"{Command.LIST.value!r}: list files only. "
        f"{Command.MODIFY.value!r}: modify all files in place. "
        f"{Command.PRINT.value!r}: print all results to stdout.",
    )
    parser.add_argument(
        "files",
        type=str,
        nargs="*",
        help="Files to modify (leave blank for all).",
    )
    parser.add_argument("--verbose", action="store_true", help="Be verbose")
    parser.add_argument(
        "--clangformat",
        type=str,
        default=shutil.which(CLANG_FORMAT_EXECUTABLE),
        help=f"Path to clang-format. Priority: (1) this argument, (2) the "
        f"results of 'which {CLANG_FORMAT_EXECUTABLE}'.",
    )
    parser.add_argument(
        "--maxlinelength",
        type=int,
        default=DEFAULT_MAX_LINE_LENGTH,
        help=f"Maximum line length for {Command.FINDLONGCOMMENTS.value!r} "
        f"command (does not affect clang-format, which is governed by our "
        f"preset {CLANG_FORMAT_BASE_FILENAME})",
    )
    parser.add_argument(
        "--ignore_urls",
        action="store_true",
        help=f"For {Command.FINDLONGCOMMENTS.value!r}, ignore any line that "
        f"contains any of: {URL_INDICATORS!r}",
    )
    parser.add_argument(
        "--bare",
        action="store_true",
        help=f"For {Command.FINDLONGCOMMENTS.value!r}, print lines bare",
    )
    parser.add_argument(
        "--debugtokens",
        action="store_true",
        help=f"For {Command.FINDLONGCOMMENTS.value!r}, show tokens. "
        f"Requires --verbose",
    )
    parser.add_argument(
        "--diffall",
        action="store_true",
        help=f"For {Command.DIFF.value!r}: proceed to diff all files, not "
        f"just the first",
    )
    parser.add_argument(
        "--diffskipidentical",
        action="store_true",
        help=f"For {Command.DIFF.value!r}: skip files that are identical "
        f"after reformatting",
    )
    parser.add_argument(
        "--difftool",
        type=str,
        default=shutil.which(DIFFTOOL),
        help=f"Tool to use for diff. Priority: (1) this argument, (2) the "
        f"results of 'which {DIFFTOOL}'",
    )
    args = parser.parse_args()

    # Configure logging before anything is logged, so that the error
    # messages below are reported in the same way as everything else.
    main_only_quicksetup_rootlogger(
        level=logging.DEBUG if args.verbose else logging.INFO
    )

    command = Command(args.command)

    # -------------------------------------------------------------------------
    # Check external tools
    # -------------------------------------------------------------------------

    if args.clangformat is None:
        log.error(
            "No clangformat executable was found on the path "
            f"({CLANG_FORMAT_EXECUTABLE!r}) and no "
            "--clangformat argument was specified"
        )
        sys.exit(EXIT_FAILURE)

    if command == Command.DIFF and args.difftool is None:
        # Without this check, the attempt to launch the diff tool would
        # fail with an obscure TypeError from the subprocess module.
        log.error(
            "No diff tool was found on the path "
            f"({DIFFTOOL!r}) and no "
            "--difftool argument was specified"
        )
        sys.exit(EXIT_FAILURE)

    output, error, retcode = runit([args.clangformat, "--version"])
    if retcode:
        raise RuntimeError(f"clang-format error: \n{error}")

    # The version number is taken to be the word following "version" in
    # the output of "clang-format --version".
    version_words = output.split()
    version_number_index = version_words.index("version") + 1

    version = Version(version_words[version_number_index])
    if version.major != CLANG_FORMAT_VERSION:
        log.error(
            f"clang-format version {version.major} != {CLANG_FORMAT_VERSION}"
        )
        sys.exit(EXIT_FAILURE)

    # -------------------------------------------------------------------------
    # Files
    # -------------------------------------------------------------------------

    # Files to process:
    if args.files:
        cpp_files = args.files
    else:
        found = set()  # type: Set[str]
        for inc in INCLUDE_GLOBS:
            found.update(glob.glob(inc, recursive=True))
        for exc in EXCLUDE_GLOBS:
            found.difference_update(glob.glob(exc, recursive=True))
        cpp_files = sorted(found)

    # -------------------------------------------------------------------------
    # Build clang-format command
    # -------------------------------------------------------------------------
    # Basic arguments
    common_clangformat_args = [
        args.clangformat,  # executable
        "--Werror",  # warnings as errors
        f"-style=file:{CLANG_FORMAT_STYLE_FILE}",  # style control file
    ]
    if command == Command.MODIFY:
        common_clangformat_args.append("-i")  # edit in place
    if args.verbose:
        common_clangformat_args.append("--verbose")  # be verbose

    # -------------------------------------------------------------------------
    # Run it
    # -------------------------------------------------------------------------
    success = True
    for filename in cpp_files:
        filename = os.path.abspath(filename)
        if command == Command.CHECK:
            log.info(f"Checking: {filename}")
        elif command == Command.DIFF:
            log.info(f"Diff: {filename}")
        elif command == Command.FINDLONGCOMMENTS:
            # This command does not involve clang-format.
            num_long_lines = print_long_comments(
                filename,
                maxlinelength=args.maxlinelength,
                ignore_urls=args.ignore_urls,
                bare=args.bare,
                debugtokens=args.debugtokens,
            )
            if num_long_lines > 0:
                success = False
            continue
        elif command == Command.LIST:
            # Nor does this one.
            print(filename)
            continue
        elif command == Command.MODIFY:
            log.info(f"Modifying: {filename}")
        else:
            log.info(f"Printing: {filename}")
        # If we are in modification mode, the next command does the
        # modification directly. Otherwise, clang-format leaves the file
        # alone and the reformatted code arrives in "output".
        clangformatcmd = common_clangformat_args + [filename]
        output, error, retcode = runit(clangformatcmd)
        if retcode:
            raise RuntimeError("clang-format error: \n" + error)
        if command in (Command.CHECK, Command.DIFF):
            # Write the reformatted code to a temporary file, for comparison
            # with the original.
            with tempfile.TemporaryDirectory() as tempdir:
                newfilename = os.path.join(
                    tempdir, os.path.basename(filename) + ".altered"
                )
                # Encode with ENC, since runit() decoded with ENC; using
                # any other encoding could create spurious differences.
                with open(newfilename, "wt", encoding=ENC) as f:
                    f.write(output)
                # Use shallow=False to compare file contents. The default
                # (shallow) comparison declares files identical if their
                # os.stat() signatures (type, size, modification time)
                # match, without looking at the contents.
                if command == Command.DIFF:
                    # Diff
                    if args.diffskipidentical and filecmp.cmp(
                        filename, newfilename, shallow=False
                    ):
                        log.info(f"Skipping unmodified file: {filename}")
                        continue
                    diffcmd = [args.difftool, filename, newfilename]
                    runit(diffcmd)
                    if not args.diffall:
                        log.info("Stopping after first diff")
                        break  # no more files
                else:
                    # Check
                    if not filecmp.cmp(filename, newfilename, shallow=False):
                        # Files differ
                        log.warning(f"File would be modified: {filename}")
                        success = False

        elif command == Command.PRINT:
            # Print
            print(output)

    sys.exit(EXIT_SUCCESS if success else EXIT_FAILURE)


# =============================================================================
# Command-line entry point
# =============================================================================

# Run the tool only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    clang_format_camcops_source()