# Source code for app.schema.analysis

"""
Schemas for signal isolation and transformation-map analysis endpoints.

These models describe the deterministic NLP analysis APIs that compare a
baseline output against a current output.  They are intentionally isolated
from generation and save concerns so the analysis surface stays easy to scan.
"""

from __future__ import annotations

from pydantic import BaseModel, Field


class IndicatorConfig(BaseModel):
    """
    Tuning knobs for micro-indicator detection.

    Carried as the optional ``indicator_config`` field of
    :class:`TransformationMapRequest`.  If the caller leaves it out,
    conservative defaults are used.  Every threshold is chosen to keep false
    positives low; callers may relax them for exploratory use.
    """

    # Both ratio thresholds are constrained to be >= 1.0.
    compression_ratio: float = Field(
        2.0,
        description=(
            "Minimum ratio of removed/added token counts to flag 'compression'. "
            "Default 2.0 means removed must be at least twice as long as added."
        ),
        ge=1.0,
    )
    expansion_ratio: float = Field(
        2.0,
        description=(
            "Minimum ratio of added/removed token counts to flag 'expansion'. "
            "Default 2.0."
        ),
        ge=1.0,
    )

    # Size guard applied before compression/expansion is considered.
    min_tokens: int = Field(
        2,
        description=(
            "Minimum token count on the larger side to consider compression/expansion. "
            "Prevents flagging single-word swaps."
        ),
        ge=1,
    )

    # A proportion, hence bounded to the closed interval [0.0, 1.0].
    modality_density_threshold: float = Field(
        0.3,
        description=(
            "Minimum absolute change in verb+adjective density (proportion of tokens) "
            "to flag 'modality shift'. Default 0.3 (conservative)."
        ),
        ge=0.0,
        le=1.0,
    )

    # Allow-list of indicator names; None leaves every indicator switched on.
    enabled: list[str] | None = Field(
        None,
        description=(
            "When not None, only compute indicators whose names appear in this list. "
            "None means all indicators are active."
        ),
    )
class TransformationMapRequest(BaseModel):
    """
    Payload accepted by ``POST /api/transformation-map``.

    Carries two plain-text strings, a baseline and a current text, for
    clause-level alignment analysis.  The backend performs sentence-aware
    alignment, then token-level diffing to extract replacement pairs, and
    finally tags each row with structural micro-indicators.
    """

    # Both texts are required and must contain at least one character.
    baseline_text: str = Field(
        ...,
        description=(
            "The reference text (A) — typically the stored baseline output. "
            "Must not be empty."
        ),
        min_length=1,
    )
    current_text: str = Field(
        ...,
        description=(
            "The comparison text (B) — typically the latest "
            "generated output. Must not be empty."
        ),
        min_length=1,
    )

    # Opt-in switch for also returning one-sided (insert/delete) operations.
    include_all: bool = Field(
        False,
        description=(
            "When True, include insert-only and delete-only operations as rows "
            "(with an empty removed or added side). "
            "When False (default), only replacement operations are returned."
        ),
    )

    # Optional indicator tuning; None means the conservative defaults apply.
    indicator_config: IndicatorConfig | None = Field(
        None,
        description=(
            "Optional tuning parameters for micro-indicator detection. When absent, "
            "conservative defaults apply."
        ),
    )
class TransformationMapRow(BaseModel):
    """
    One clause-level replacement pair, optionally tagged with micro-indicators.

    ``indicators`` holds zero or more structural pattern labels produced by
    the ``micro_indicators`` module.  Every label is a deterministic heuristic
    tag, for example ``"compression"``, ``"embodiment shift"``, or
    ``"intensity ↑"``.
    """

    # The two sides of the replacement: baseline chunk and its substitute.
    removed: str = Field(
        ...,
        description="The text chunk from the baseline (A) that was replaced.",
    )
    added: str = Field(
        ...,
        description="The text chunk from the current text (B) that replaced it.",
    )

    # default_factory gives each row its own fresh list when none is supplied.
    indicators: list[str] = Field(
        default_factory=list,
        description=(
            "Micro-indicator labels for this row "
            "(e.g. 'compression', 'embodiment shift'). "
            "Empty list if no indicators match. "
            "Computed server-side by deterministic heuristics."
        ),
    )
class TransformationMapResponse(BaseModel):
    """
    Payload returned by ``POST /api/transformation-map``.

    Wraps the list of clause-level replacement pairs that sentence-aware
    alignment and token-level diffing extracted from the two texts.
    """

    # Required; one TransformationMapRow per extracted pair, in order.
    rows: list[TransformationMapRow] = Field(
        ...,
        description=(
            "Ordered list of clause-level replacement pairs. "
            "Each row shows text removed from A and the corresponding text added in B."
        ),
    )
class DeltaRequest(BaseModel):
    """
    Payload accepted by ``POST /api/analyze-delta``.

    Carries two plain-text strings, a baseline and a current text, for signal
    isolation analysis.  The backend feeds both through the NLP pipeline
    (tokenise → lemmatise → filter stopwords) and responds with the set
    difference as sorted word lists.
    """

    # Both texts are required and must contain at least one character.
    baseline_text: str = Field(
        ...,
        description=(
            "The reference text (A) — typically the stored baseline output. "
            "Must not be empty."
        ),
        examples=["The weathered figure stands near the threshold."],
        min_length=1,
    )
    current_text: str = Field(
        ...,
        description=(
            "The comparison text (B) — typically the latest "
            "generated output. Must not be empty."
        ),
        examples=["A dark goblin lurks beyond the crumbling gate."],
        min_length=1,
    )
class DeltaResponse(BaseModel):
    """
    Payload returned by ``POST /api/analyze-delta``.

    Holds two alphabetically sorted lists of content lemmas capturing the
    meaningful lexical differences between the baseline and current texts
    once stopwords are removed and tokens are lemmatised.

    Both lists are **set differences**, not positional diffs:

    - ``removed`` = content lemmas in A but absent from B.
    - ``added`` = content lemmas in B but absent from A.
    """

    # A-only lemmas.
    removed: list[str] = Field(
        ...,
        description=(
            "Content lemmas present in the baseline (A) but absent from "
            "the current text (B). Alphabetically sorted."
        ),
    )

    # B-only lemmas.
    added: list[str] = Field(
        ...,
        description=(
            "Content lemmas present in the current text (B) but absent from the "
            "baseline (A). Alphabetically sorted."
        ),
    )