"""
app/transformation_map.py
-----------------------------------------------------------------------------
Clause-Level Alignment Layer (Transformation Map) for the Axis Descriptor Lab.
Why a dedicated module?
-----------------------
The word-level diff (client-side LCS) is too granular — clause rewrites appear
as a long sequence of single-word insertions and deletions, obscuring the
structural change. The signal isolation layer (``signal_isolation.py``) is
lexically useful but structure-blind (set difference, not positional).
The Transformation Map fills the gap by extracting clause-scale replacement
pairs — showing *what chunk of text was replaced by what chunk* — without
semantic interpretation.
Pipeline (sentence-aware alignment)
------------------------------------
1. **Normalise** — collapse whitespace, strip edges.
2. **Sentence split** — ``nltk.sent_tokenize()`` on both texts.
3. **Sentence alignment** — ``difflib.SequenceMatcher`` on sentence lists
to pair sentences (equal, replace, insert, delete).
4. **Token-level alignment within matched sentence pairs** — for each
"replace" sentence pair, run ``difflib.SequenceMatcher`` on
``nltk.word_tokenize()`` tokens and extract "replace" opcodes.
5. **For "equal" sentence pairs** — skip (no changes).
6. **For insert/delete-only sentences** — optionally included via the
``include_all`` parameter. When False (default), only replace
operations are shown. When True, inserts and deletes appear as
rows with an empty removed or added side.
Noise reduction
---------------
- Ignore replacements where both sides are a single stopword.
- Merge adjacent replace operations into a single row.
- Normalise whitespace before alignment.
NLTK data requirements
----------------------
Reuses the same NLTK data packages as ``signal_isolation.py``:
``punkt_tab``, ``stopwords``. These resources are validated explicitly at
call time rather than being downloaded during module import.
"""
from __future__ import annotations
import difflib
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from app.nltk_support import ensure_nltk_data, english_stopwords
# -----------------------------------------------------------------------------
# Private helpers
# -----------------------------------------------------------------------------
def _normalise_whitespace(text: str) -> str:
"""Collapse runs of whitespace to single spaces and strip edges."""
return re.sub(r"\s+", " ", text).strip()
def _is_single_stopword(text: str) -> bool:
"""Return True if *text* is a single English stopword (case-insensitive)."""
stripped = text.strip().lower()
# Must be a single token with no internal spaces after stripping
if " " in stripped:
return False
return stripped in english_stopwords()
def _extract_token_changes(
    sentences_a: list[str],
    sentences_b: list[str],
    *,
    include_all: bool = False,
) -> list[dict[str, str]]:
    """
    Align two sentence groups token-by-token and collect the change spans.

    Each group is joined with single spaces, tokenised with
    ``word_tokenize`` and compared with ``difflib.SequenceMatcher``.
    Every non-"equal" opcode that passes the filters becomes one row.

    Parameters
    ----------
    sentences_a : Sentence(s) from the baseline side of a replace opcode.
    sentences_b : Sentence(s) from the current side of a replace opcode.
    include_all : When True, "insert" and "delete" opcodes are reported in
        addition to "replace" opcodes. When False, only "replace"
        opcodes are reported.

    Returns
    -------
    List of {"removed": ..., "added": ...} dicts in opcode order. Tokens
    within a span are joined by single spaces; a delete row has an empty
    "added" value and an insert row has an empty "removed" value.
    Replacements where both sides are a single stopword are dropped.
    """
    baseline_text = " ".join(sentences_a)
    current_text = " ".join(sentences_b)

    # NLTK resources are checked here, at call time, before tokenising.
    ensure_nltk_data()
    baseline_tokens = word_tokenize(baseline_text)
    current_tokens = word_tokenize(current_text)

    opcodes = difflib.SequenceMatcher(
        None, baseline_tokens, current_tokens
    ).get_opcodes()

    changes: list[dict[str, str]] = []
    for op, a_start, a_end, b_start, b_end in opcodes:
        if op == "equal":
            continue
        is_replace = op == "replace"
        # Pure inserts/deletes are only reported on request.
        if not is_replace and not include_all:
            continue

        # For "delete" the b-range is empty and for "insert" the a-range
        # is empty, so the corresponding side joins to "".
        old_span = " ".join(baseline_tokens[a_start:a_end])
        new_span = " ".join(current_tokens[b_start:b_end])

        # Noise reduction: a stopword swapped for another stopword is
        # not worth a row.
        if (
            is_replace
            and _is_single_stopword(old_span)
            and _is_single_stopword(new_span)
        ):
            continue

        changes.append({"removed": old_span, "added": new_span})

    return changes
def _merge_adjacent(rows: list[dict[str, str]]) -> list[dict[str, str]]:
"""
Merge adjacent replacement rows into single rows.
Two rows are "adjacent" when they appear consecutively in the list
(which preserves document order from the SequenceMatcher opcodes).
Merging them produces a single row whose removed/added text is the
concatenation separated by a space.
"""
if not rows:
return []
merged: list[dict[str, str]] = [rows[0].copy()]
for row in rows[1:]:
# Always merge consecutive rows — they represent adjacent replace
# opcodes from the same sentence pair, which together form a
# single clause-level substitution.
merged[-1] = {
"removed": merged[-1]["removed"] + " " + row["removed"],
"added": merged[-1]["added"] + " " + row["added"],
}
return merged
# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------