TexTeller/texteller/api/format.py

#!/usr/bin/env python3
"""
Python implementation of tex-fmt, a LaTeX formatter.
Based on the Rust implementation at https://github.com/WGUNDERWOOD/tex-fmt
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

# Constants
LINE_END = "\n"
ITEM = "\\item"
DOC_BEGIN = "\\begin{document}"
DOC_END = "\\end{document}"
ENV_BEGIN = "\\begin{"
ENV_END = "\\end{"
# Prefixes given to the continuation line when a long line is wrapped:
# nothing for ordinary text, "% " when the break falls inside a comment
TEXT_LINE_START = ""
COMMENT_LINE_START = "% "

# Opening and closing delimiters; each occurrence on a line shifts the
# indent level of the following lines by one
OPENS = ['{', '(', '[']
CLOSES = ['}', ')', ']']

# Names of LaTeX verbatim environments
VERBATIMS = ["verbatim", "Verbatim", "lstlisting", "minted", "comment"]
# Full \begin{...} / \end{...} tags for each verbatim environment
VERBATIMS_BEGIN = [f"\\begin{{{v}}}" for v in VERBATIMS]
VERBATIMS_END = [f"\\end{{{v}}}" for v in VERBATIMS]

# Regex fragments for commands that are moved onto a line of their own:
# environment begin/end, \item (not followed by a letter), and the
# sectioning commands \part, \chapter and \(sub)(sub)section
SPLITTING = [
    r"\\begin\{",
    r"\\end\{",
    r"\\item(?:$|[^a-zA-Z])",
    r"\\(?:sub){0,2}section\*?\{",
    r"\\chapter\*?\{",
    r"\\part\*?\{",
]

# Compiled regexes
# All splitting fragments joined into a single capturing alternation
SPLITTING_STRING = f"({'|'.join(SPLITTING)})"
# Three or more consecutive newlines, i.e. two or more blank lines in a row
RE_NEWLINES = re.compile(f"{LINE_END}{LINE_END}({LINE_END})+")
# One or more spaces immediately before a newline
RE_TRAIL = re.compile(f" +{LINE_END}")
# Any splitting command anywhere in a string
RE_SPLITTING = re.compile(SPLITTING_STRING)
# A splitting command that has at least one non-whitespace character before it
RE_SPLITTING_SHARED_LINE = re.compile(f"(?:\\S.*?)(?:{SPLITTING_STRING}.*)")
# Same shape with named groups: "prev" is the text before the command,
# "env" is the command together with everything after it
RE_SPLITTING_SHARED_LINE_CAPTURE = re.compile(f"(?P<prev>\\S.*?)(?P<env>{SPLITTING_STRING}.*)")


@dataclass
class Args:
    """Formatter configuration.

    Attributes:
        tabchar: Indentation character; a tab selects tab indentation, any
            other value selects spaces and makes tabs in the input get
            replaced by ``tabsize`` spaces.
        tabsize: Number of indentation characters per indent level.
        wrap: Whether long lines are wrapped.
        wraplen: Maximum line length (including indentation) before a line
            is considered for wrapping.
        wrapmin: Target width used when searching for the wrap position.
        lists: Names of extra environments indented like lists (one
            additional indent level).
        verbosity: Log verbosity; values of 3 and above add TRACE entries.
    """

    tabchar: str = " "
    tabsize: int = 4
    wrap: bool = False
    wraplen: int = 80
    wrapmin: int = 40
    # A factory gives every instance its own list and keeps the default
    # consistent with the declared ``List[str]`` type.
    lists: List[str] = field(default_factory=list)
    verbosity: int = 0

    def __post_init__(self):
        # Older callers may still pass ``lists=None`` explicitly; keep
        # accepting that and normalise it to an empty list.
        if self.lists is None:
            self.lists = []


@dataclass
class Ignore:
    """Ignore flags attached to a line.

    ``actual`` records whether an ignore block is open once the line has
    been processed; ``visual`` records whether the line itself is left
    unformatted.
    """

    actual: bool = False
    visual: bool = False

    @classmethod
    def new(cls):
        """Return an instance with both flags cleared."""
        return cls(actual=False, visual=False)


@dataclass
class Verbatim:
    """Verbatim flags attached to a line.

    ``actual`` is the verbatim nesting depth once the line has been
    processed; ``visual`` records whether the line itself is treated as
    verbatim and therefore left unformatted.
    """

    actual: int = 0
    visual: bool = False

    @classmethod
    def new(cls):
        """Return an instance at depth zero that is not verbatim."""
        return cls(actual=0, visual=False)


@dataclass
class Indent:
    """Indentation levels attached to a line.

    ``actual`` is the level in effect after the line (used for the lines
    that follow); ``visual`` is the level at which the line itself is
    printed.
    """

    actual: int = 0
    visual: int = 0

    @classmethod
    def new(cls):
        """Return an instance with both levels at zero."""
        return cls(actual=0, visual=0)


@dataclass
class State:
    """Information on the current state during formatting.

    Attributes:
        linum_old: Line number in the original text (1-based).
        linum_new: Line number in the formatted text (1-based).
        ignore: Ignore flags carried over from the previous line.
        indent: Indentation levels carried over from the previous line.
        verbatim: Verbatim flags carried over from the previous line.
        linum_last_zero_indent: Output line number of the most recent line
            printed with zero indentation.
    """

    linum_old: int = 1
    linum_new: int = 1
    # Factories give every State its own sub-objects and keep the defaults
    # consistent with the declared (non-optional) field types.
    ignore: Ignore = field(default_factory=Ignore.new)
    indent: Indent = field(default_factory=Indent.new)
    verbatim: Verbatim = field(default_factory=Verbatim.new)
    linum_last_zero_indent: int = 1

    def __post_init__(self):
        # Still accept an explicit ``None`` for backward compatibility and
        # replace it with a fresh default object.
        if self.ignore is None:
            self.ignore = Ignore.new()
        if self.indent is None:
            self.indent = Indent.new()
        if self.verbatim is None:
            self.verbatim = Verbatim.new()


@dataclass
class Pattern:
    """Flags describing which formatting-relevant patterns a line contains."""

    contains_env_begin: bool = False
    contains_env_end: bool = False
    contains_item: bool = False
    contains_splitting: bool = False
    contains_comment: bool = False

    @classmethod
    def new(cls, s: str):
        """Scan *s* once and return the matching set of flags.

        The environment and item flags are only evaluated when the
        splitting regex matches somewhere in *s*; otherwise they stay
        False. The comment flag is set whenever *s* contains ``%``.
        """
        has_comment = '%' in s

        # Without any splitting command none of the other flags can be set
        if RE_SPLITTING.search(s) is None:
            return cls(contains_comment=has_comment)

        return cls(
            contains_env_begin=ENV_BEGIN in s,
            contains_env_end=ENV_END in s,
            contains_item=ITEM in s,
            contains_splitting=True,
            contains_comment=has_comment,
        )


@dataclass
class Log:
    """Log message.

    One record collected while formatting. The line-related fields are
    optional so that file-level messages can omit them.
    """

    # Severity label; this module emits "INFO", "WARN" and "TRACE"
    level: str
    # Name of the file being formatted
    file: str
    # Human-readable description of the event
    message: str
    # Line number in the formatted output, when the record concerns a line
    linum_new: Optional[int] = None
    # Line number in the original input, when the record concerns a line
    linum_old: Optional[int] = None
    # Text of the line concerned, when the record concerns a line
    line: Optional[str] = None


def find_comment_index(line: str, pattern: Pattern) -> Optional[int]:
    """Find the index of the first comment character in a line.

    A ``%`` starts a comment unless it is escaped by an immediately
    preceding backslash (``\\%``). A backslash escapes exactly one
    character, so the ``%`` in ``\\\\%`` (line break followed by a comment)
    and in ``\\foo%`` (command name followed by a comment) both count as
    comment starts.

    Args:
        line: The line to scan.
        pattern: Pre-computed flags for the line; when ``contains_comment``
            is false the scan is skipped entirely.

    Returns:
        The index of the first unescaped ``%``, or None if there is none.
    """
    # Often there is no '%' at all, so check the cheap flag first
    if not pattern.contains_comment:
        return None

    escaped = False
    for i, c in enumerate(line):
        if escaped:
            # This character is consumed by the preceding backslash
            escaped = False
        elif c == '\\':
            escaped = True
        elif c == '%':
            return i

    return None


def contains_ignore_skip(line: str) -> bool:
    """Report whether *line* ends with the single-line skip directive."""
    directive = "% tex-fmt: skip"
    return line.endswith(directive)


def contains_ignore_begin(line: str) -> bool:
    """Report whether *line* ends with the directive that opens an ignore block."""
    directive = "% tex-fmt: off"
    return line.endswith(directive)


def contains_ignore_end(line: str) -> bool:
    """Report whether *line* ends with the directive that closes an ignore block."""
    directive = "% tex-fmt: on"
    return line.endswith(directive)


def get_ignore(line: str, state: State, logs: List[Log], file: str, warn: bool) -> Ignore:
    """Work out the ignore flags for *line* from its directives and the prior state.

    A skip directive hides only this line, a begin directive opens an
    ignore block, an end directive closes one, and any other line inherits
    the block status from ``state``. When ``warn`` is true, opening a block
    that is already open or closing one that is not open appends a WARN
    record to ``logs``.
    """
    already_ignoring = state.ignore.actual

    def _warn(message: str) -> None:
        logs.append(
            Log(
                level="WARN",
                file=file,
                message=message,
                linum_new=state.linum_new,
                linum_old=state.linum_old,
                line=line,
            )
        )

    if contains_ignore_skip(line):
        # Only this line is hidden; the block status is untouched
        return Ignore(actual=already_ignoring, visual=True)

    if contains_ignore_begin(line):
        if warn and already_ignoring:
            _warn("Cannot begin ignore block:")
        return Ignore(actual=True, visual=True)

    if contains_ignore_end(line):
        if warn and not already_ignoring:
            _warn("No ignore block to end.")
        return Ignore(actual=False, visual=True)

    # No directive: the line is hidden exactly when a block is open
    return Ignore(actual=already_ignoring, visual=already_ignoring)


def get_verbatim_diff(line: str, pattern: Pattern) -> int:
    """Return +1 if *line* opens a verbatim environment, -1 if it closes one, else 0.

    An opening tag takes precedence when both appear on the same line.
    """
    opens = pattern.contains_env_begin and any(tag in line for tag in VERBATIMS_BEGIN)
    if opens:
        return 1

    closes = pattern.contains_env_end and any(tag in line for tag in VERBATIMS_END)
    return -1 if closes else 0


def get_verbatim(
    line: str, state: State, logs: List[Log], file: str, warn: bool, pattern: Pattern
) -> Verbatim:
    """Compute the verbatim flags for *line*.

    The new depth is the previous depth plus this line's change. The line
    counts as verbatim when the depth is positive either before or after
    it, so both boundary lines of an environment are included. A negative
    depth appends a WARN record to ``logs`` when ``warn`` is true.
    """
    depth_before = state.verbatim.actual
    depth_after = depth_before + get_verbatim_diff(line, pattern)

    if depth_after < 0 and warn:
        logs.append(
            Log(
                level="WARN",
                file=file,
                message="Verbatim count is negative.",
                linum_new=state.linum_new,
                linum_old=state.linum_old,
                line=line,
            )
        )

    return Verbatim(actual=depth_after, visual=depth_after > 0 or depth_before > 0)


def get_diff(line: str, pattern: Pattern, lists_begin: List[str], lists_end: List[str]) -> int:
    """Return the net change in indent level caused by *line*.

    An environment begin adds one level (two for list environments), an
    environment end removes the same amount, and every opening or closing
    delimiter adds or removes one more. Lines that begin or end the
    document environment contribute nothing at all.
    """
    env_change = 0

    if pattern.contains_env_begin and ENV_BEGIN in line:
        # Documents get no global indentation
        if DOC_BEGIN in line:
            return 0
        env_change = 2 if any(tag in line for tag in lists_begin) else 1
    elif pattern.contains_env_end and ENV_END in line:
        # Documents get no global indentation
        if DOC_END in line:
            return 0
        env_change = -2 if any(tag in line for tag in lists_end) else -1

    # Delimiters each count for one level
    opened = sum(1 for ch in line if ch in OPENS)
    closed = sum(1 for ch in line if ch in CLOSES)

    return env_change + opened - closed


def get_back(line: str, pattern: Pattern, state: State, lists_end: List[str]) -> int:
    """Return how many levels *line* itself is pulled back when printed.

    Nothing is pulled back while the current indent level is zero. An
    environment end is pulled back one level (two for list environments,
    none for the document environment); an ``\\item`` line is pulled back
    one level.
    """
    # Only need to dedent if indentation is present
    if state.indent.actual == 0:
        return 0

    closes_env = pattern.contains_env_end and ENV_END in line
    if closes_env:
        # Documents get no global indentation
        if DOC_END in line:
            return 0
        # List environments carry a double indent for their items
        return 2 if any(tag in line for tag in lists_end) else 1

    is_item = pattern.contains_item and ITEM in line
    return 1 if is_item else 0


def get_indent(
    line: str,
    prev_indent: Indent,
    pattern: Pattern,
    state: State,
    lists_begin: List[str],
    lists_end: List[str],
) -> Indent:
    """Combine the previous indent with this line's change and pull-back.

    The returned ``actual`` level is the previous level plus the line's
    net change; the ``visual`` level is the previous level minus the
    line's pull-back, never below zero.
    """
    level_change = get_diff(line, pattern, lists_begin, lists_end)
    pull_back = get_back(line, pattern, state, lists_end)
    previous_level = prev_indent.actual

    return Indent(
        actual=previous_level + level_change,
        visual=max(0, previous_level - pull_back),
    )


def calculate_indent(
    line: str,
    state: State,
    logs: List[Log],
    file: str,
    args: Args,
    pattern: Pattern,
    lists_begin: List[str],
    lists_end: List[str],
) -> Indent:
    """Compute the indent for *line*, store it on ``state`` and return it.

    Also records the current output line number on ``state`` whenever the
    line is printed with zero indentation. ``logs``, ``file`` and ``args``
    are accepted but not used here.
    """
    state.indent = get_indent(line, state.indent, pattern, state, lists_begin, lists_end)

    # Remember the most recent line that sits at the left margin
    if state.indent.visual == 0:
        state.linum_last_zero_indent = state.linum_new

    return state.indent


def apply_indent(line: str, indent: Indent, args: Args, indent_char: str) -> str:
    """Return *line* with its leading whitespace replaced by the computed indent.

    Whitespace-only lines become the empty string. Otherwise the prefix is
    ``indent_char`` repeated ``indent.visual * args.tabsize`` times.
    """
    content = line.lstrip()
    if not content:
        return ""

    width = indent.visual * args.tabsize
    return indent_char * width + content


def needs_wrap(line: str, indent_length: int, args: Args) -> bool:
    """Report whether wrapping is enabled and the indented line exceeds ``args.wraplen``."""
    total_width = indent_length + len(line)
    return args.wrap and total_width > args.wraplen


def find_wrap_point(line: str, indent_length: int, args: Args) -> Optional[int]:
    """Return the index of the space at which *line* should be broken.

    Candidate break points are spaces that are not preceded by a backslash
    and that come after at least one character other than a space or
    ``%``. The scan keeps the last candidate seen and stops once the
    position passes ``args.wrapmin - indent_length`` with a candidate in
    hand. Returns None when the line has no candidate.
    """
    boundary = args.wrapmin - indent_length
    candidate = None
    seen_text = False
    previous = None

    for position, char in enumerate(line):
        # position + 1 is the width consumed up to and including this character
        if candidate is not None and position + 1 > boundary:
            return candidate

        is_break_space = char == ' ' and previous != '\\'
        if is_break_space:
            if seen_text:
                candidate = position
        elif char != '%':
            seen_text = True

        previous = char

    return candidate


def apply_wrap(
    line: str,
    indent_length: int,
    state: State,
    file: str,
    args: Args,
    logs: List[Log],
    pattern: Pattern,
) -> Optional[List[str]]:
    """Break a long line at its wrap point.

    Returns ``[head, continuation_prefix, tail]`` where the prefix is the
    comment marker when the break falls after the start of a comment and
    empty otherwise. Returns None, after appending a WARN record, when no
    wrap point exists or it lies beyond ``args.wraplen``. A TRACE record
    is appended first when ``args.verbosity`` is 3 or higher.
    """

    def _record(level: str, message: str) -> None:
        logs.append(
            Log(
                level=level,
                file=file,
                message=message,
                linum_new=state.linum_new,
                linum_old=state.linum_old,
                line=line,
            )
        )

    if args.verbosity >= 3:  # Trace level
        _record("TRACE", "Wrapping long line.")

    break_at = find_wrap_point(line, indent_length, args)
    if break_at is None or break_at > args.wraplen:
        _record("WARN", "Line cannot be wrapped.")
        return None

    comment_at = find_comment_index(line, pattern)
    breaks_inside_comment = comment_at is not None and break_at > comment_at
    continuation_prefix = COMMENT_LINE_START if breaks_inside_comment else TEXT_LINE_START

    # The space at the break position itself is dropped
    head = line[:break_at]
    tail = line[break_at + 1 :]

    return [head, continuation_prefix, tail]


def needs_split(line: str, pattern: Pattern) -> bool:
    """Decide whether *line* holds a splitting command that must move to a new line.

    That is the case when a splitting command is preceded by other text on
    the line and does not start after the beginning of a comment.
    """
    # No splitting command at all, or none that shares the line with text
    if not pattern.contains_splitting:
        return False
    if RE_SPLITTING_SHARED_LINE.search(line) is None:
        return False

    # Without a comment the split point is necessarily in ordinary text
    comment_index = find_comment_index(line, pattern)
    if comment_index is None:
        return True

    # Split only when the command does not start after the comment begins
    captured = RE_SPLITTING_SHARED_LINE_CAPTURE.search(line)
    return captured is None or captured.start(2) <= comment_index


def split_line(line: str, state: State, file: str, args: Args, logs: List[Log]) -> Tuple[str, str]:
    """Split *line* in front of its first splitting command.

    Returns ``(text_before, command_and_rest)``; when the line has no such
    command preceded by text it is returned unchanged together with an
    empty string. A TRACE record is appended for each split when
    ``args.verbosity`` is 3 or higher.
    """
    found = RE_SPLITTING_SHARED_LINE_CAPTURE.search(line)
    if found is None:
        return line, ""

    if args.verbosity >= 3:  # Trace level
        logs.append(
            Log(
                level="TRACE",
                file=file,
                message="Placing environment on new line.",
                linum_new=state.linum_new,
                linum_old=state.linum_old,
                line=line,
            )
        )

    return found.group('prev'), found.group('env')


def set_ignore_and_report(
    line: str, temp_state: State, logs: List[Log], file: str, pattern: Pattern
) -> bool:
    """Update the ignore and verbatim flags on ``temp_state`` for *line*.

    Warnings are always enabled for both checks. Returns True when the
    line must be left unformatted, i.e. when it is verbatim or ignored.
    """
    ignore = get_ignore(line, temp_state, logs, file, True)
    temp_state.ignore = ignore

    verbatim = get_verbatim(line, temp_state, logs, file, True, pattern)
    temp_state.verbatim = verbatim

    return verbatim.visual or ignore.visual


def clean_text(text: str, args: Args) -> str:
    """Normalise raw input before formatting.

    Collapses runs of blank lines to a single blank line, expands tabs to
    ``args.tabsize`` spaces unless tab indentation is selected, and strips
    trailing spaces from every line ending.
    """
    # Collapse three or more newlines to exactly two
    cleaned = RE_NEWLINES.sub(LINE_END * 2, text)

    # Expand tabs unless tabs are the chosen indentation character
    if args.tabchar != '\t':
        cleaned = cleaned.replace('\t', ' ' * args.tabsize)

    # Drop spaces that sit directly before a newline
    return RE_TRAIL.sub(LINE_END, cleaned)


def remove_trailing_spaces(text: str) -> str:
    """Strip the spaces that directly precede each newline in *text*."""
    without_trailing = RE_TRAIL.sub(LINE_END, text)
    return without_trailing


def remove_trailing_blank_lines(text: str) -> str:
    """Strip all trailing whitespace from *text* and end it with a single newline."""
    body = text.rstrip()
    return f"{body}{LINE_END}"


def indents_return_to_zero(state: State) -> bool:
    """Report whether the indent level stored on *state* is exactly zero."""
    final_level = state.indent.actual
    return final_level == 0


def format_latex(text: str) -> str:
    """Format LaTeX text using the default configuration.

    This is the public entry point of the module. It builds a default
    ``Args``, runs the formatter under the placeholder file name
    ``input.tex``, discards the collected logs and strips leading and
    trailing whitespace from the result.

    Args:
        text: LaTeX text to format

    Returns:
        Formatted LaTeX text
    """
    default_args = Args()
    result = _format_latex(text, "input.tex", default_args)

    # Only the formatted text is of interest; the logs are dropped
    return result[0].strip()


def _format_latex(old_text: str, file: str, args: Args) -> Tuple[str, List[Log]]:
    """Internal function to format a LaTeX string.

    Each source line is cleaned, then processed through a small work
    stack: a line may be split in front of a splitting command or wrapped
    when too long, in which case its pieces are pushed back and processed
    like fresh lines. Ignored and verbatim lines are copied unchanged.

    Args:
        old_text: The LaTeX source to format.
        file: File name recorded in the log entries.
        args: Formatter configuration.

    Returns:
        A tuple of the formatted text (ending in exactly one newline) and
        the list of log records collected along the way.
    """
    logs: List[Log] = [Log(level="INFO", file=file, message="Formatting started.")]

    # Clean the source file
    old_text = clean_text(old_text, args)

    # Initialize
    state = State()
    # Work stack of (original line number, text). The entry to process next
    # sits at the END of the list so that pushing and popping are O(1);
    # popping from the front of a list would make long inputs quadratic.
    pending: List[Tuple[int, str]] = []
    # Output pieces are collected and joined once at the end
    new_lines: List[str] = []

    # Select the character used for indentation
    indent_char = '\t' if args.tabchar == '\t' else ' '

    # Get any extra environments to be indented as lists
    lists_begin = [f"\\begin{{{name}}}" for name in args.lists]
    lists_end = [f"\\end{{{name}}}" for name in args.lists]

    for numbered_line in enumerate(old_text.splitlines(), 1):
        pending.append(numbered_line)

        # Drain everything this source line expands into before moving on
        while pending:
            linum_old, line = pending.pop()

            # Read the patterns present on this line
            pattern = Pattern.new(line)

            # Temporary state for working on this line; it only replaces
            # the real state once the line is actually emitted
            temp_state = State(
                linum_old=linum_old,
                linum_new=state.linum_new,
                ignore=Ignore(state.ignore.actual, state.ignore.visual),
                indent=Indent(state.indent.actual, state.indent.visual),
                verbatim=Verbatim(state.verbatim.actual, state.verbatim.visual),
                linum_last_zero_indent=state.linum_last_zero_indent,
            )

            # If the line should not be ignored...
            if not set_ignore_and_report(line, temp_state, logs, file, pattern):
                # Check if the line should be split because of a pattern
                # that should begin on a new line
                if needs_split(line, pattern):
                    # Split the line into two...
                    this_line, next_line = split_line(line, temp_state, file, args, logs)
                    # ...and schedule the second part for formatting
                    if next_line:
                        pending.append((linum_old, next_line))
                    line = this_line

                # Calculate the indent based on the current state and the
                # patterns in the line
                indent = calculate_indent(
                    line, temp_state, logs, file, args, pattern, lists_begin, lists_end
                )

                indent_length = indent.visual * args.tabsize

                # Wrap the line before applying the indent, and loop back
                # if the line needed wrapping
                if needs_wrap(line.lstrip(), indent_length, args):
                    wrapped_lines = apply_wrap(
                        line.lstrip(), indent_length, temp_state, file, args, logs, pattern
                    )
                    if wrapped_lines:
                        this_line, next_line_start, next_line = wrapped_lines
                        # Push the remainder first so the head is popped
                        # (and therefore emitted) before it
                        pending.append((linum_old, next_line_start + next_line))
                        pending.append((linum_old, this_line))
                        # temp_state is discarded: both pieces are redone
                        continue

                # Lastly, apply the indent if the line didn't need wrapping
                line = apply_indent(line, indent, args, indent_char)

            # Add line to new text
            state = temp_state
            new_lines.append(line + LINE_END)
            state.linum_new += 1

    if not indents_return_to_zero(state):
        msg = f"Indent does not return to zero. Last non-indented line is line {state.linum_last_zero_indent}"
        logs.append(Log(level="WARN", file=file, message=msg))

    new_text = "".join(new_lines)
    new_text = remove_trailing_spaces(new_text)
    new_text = remove_trailing_blank_lines(new_text)
    logs.append(Log(level="INFO", file=file, message="Formatting complete."))

    return new_text, logs