Source code for app.schema.analysis

"""
Schemas for signal isolation and transformation-map analysis endpoints.

These models describe the deterministic NLP analysis APIs that compare a
baseline output against a current output.  They are intentionally isolated
from generation and save concerns so the analysis surface stays easy to scan.
"""

from __future__ import annotations

from pydantic import BaseModel, Field



[docs]
class IndicatorConfig(BaseModel):
    """
    Tuning knobs for micro-indicator detection.

    Travels as an optional field of :class:`TransformationMapRequest`.
    Every field carries a default, and those defaults are deliberately
    conservative: the thresholds aim to keep false positives low, and
    callers may relax them for exploratory work.
    """

    # Ratio thresholds are constrained to be >= 1.0.
    compression_ratio: float = Field(
        2.0,
        description=(
            "Minimum ratio of removed/added token counts to flag 'compression'.  "
            "Default 2.0 means removed must be at least twice as long as added."
        ),
        ge=1.0,
    )
    expansion_ratio: float = Field(
        2.0,
        description=(
            "Minimum ratio of added/removed token counts to flag 'expansion'.  "
            "Default 2.0."
        ),
        ge=1.0,
    )

    # Size floor (at least 1) applied before compression/expansion is considered.
    min_tokens: int = Field(
        2,
        description=(
            "Minimum token count on the larger side to consider compression/expansion.  "
            "Prevents flagging single-word swaps."
        ),
        ge=1,
    )

    # A proportion, hence bounded to the closed interval [0.0, 1.0].
    modality_density_threshold: float = Field(
        0.3,
        description=(
            "Minimum absolute change in verb+adjective density (proportion of "
            "tokens) to flag 'modality shift'.  Default 0.3 (conservative)."
        ),
        ge=0.0,
        le=1.0,
    )

    # Optional allow-list of indicator names; None leaves every indicator on.
    enabled: list[str] | None = Field(
        None,
        description=(
            "When not None, only compute indicators whose names appear in this list.  "
            "None means all indicators are active."
        ),
    )




[docs]
class TransformationMapRequest(BaseModel):
    """
    Payload accepted by ``POST /api/transformation-map``.

    Carries two plain-text strings, a baseline (A) and a current text (B),
    for clause-level alignment analysis.  The backend performs
    sentence-aware alignment, then token-level diffing to pull out
    replacement pairs, and finally tags each row with structural
    micro-indicators.
    """

    # Both texts are required and must contain at least one character.
    baseline_text: str = Field(
        default=...,
        description=(
            "The reference text (A) — typically the stored baseline output.  "
            "Must not be empty."
        ),
        min_length=1,
    )
    current_text: str = Field(
        default=...,
        description="The comparison text (B) — typically the latest generated output.  Must not be empty.",
        min_length=1,
    )

    # Off by default, so only replacement operations come back.
    include_all: bool = Field(
        False,
        description=(
            "When True, include insert-only and delete-only operations as rows "
            "(with an empty removed or added side).  "
            "When False (default), only replacement operations are returned."
        ),
    )

    # Omitted (None) unless the caller wants to tune indicator detection.
    indicator_config: IndicatorConfig | None = Field(
        None,
        description=(
            "Optional tuning parameters for micro-indicator detection.  When absent, "
            "conservative defaults apply."
        ),
    )




[docs]
class TransformationMapRow(BaseModel):
    """
    One clause-level replacement pair, optionally tagged with micro-indicators.

    ``indicators`` holds zero or more structural pattern labels produced by
    the ``micro_indicators`` module.  Each label is a deterministic
    heuristic tag, for example ``"compression"``, ``"embodiment shift"``,
    or ``"intensity ↑"``.
    """

    # Both sides of the pair are required strings.
    removed: str = Field(
        default=...,
        description="The text chunk from the baseline (A) that was replaced.",
    )
    added: str = Field(
        default=...,
        description="The text chunk from the current text (B) that replaced it.",
    )

    # Built via a factory so each row gets its own fresh list.
    indicators: list[str] = Field(
        description=(
            "Micro-indicator labels for this row "
            "(e.g. 'compression', 'embodiment shift').  "
            "Empty list if no indicators match.  "
            "Computed server-side by deterministic heuristics."
        ),
        default_factory=list,
    )




[docs]
class TransformationMapResponse(BaseModel):
    """
    Payload returned by ``POST /api/transformation-map``.

    Wraps the list of clause-level replacement pairs that sentence-aware
    alignment and token-level diffing extracted.
    """

    # Required; the list itself may be of any length.
    rows: list[TransformationMapRow] = Field(
        default=...,
        description=(
            "Ordered list of clause-level replacement pairs.  "
            "Each row shows text removed from A and the corresponding text added in B."
        ),
    )




[docs]
class DeltaRequest(BaseModel):
    """
    Payload accepted by ``POST /api/analyze-delta``.

    Carries two plain-text strings, a baseline (A) and a current text (B),
    for signal isolation analysis.  The backend sends both through the NLP
    pipeline (tokenise → lemmatise → filter stopwords) and answers with the
    set difference as sorted word lists.
    """

    # Both texts are required and must contain at least one character.
    baseline_text: str = Field(
        default=...,
        description=(
            "The reference text (A) — typically the stored baseline output.  "
            "Must not be empty."
        ),
        examples=["The weathered figure stands near the threshold."],
        min_length=1,
    )
    current_text: str = Field(
        default=...,
        description="The comparison text (B) — typically the latest generated output.  Must not be empty.",
        examples=["A dark goblin lurks beyond the crumbling gate."],
        min_length=1,
    )




[docs]
class DeltaResponse(BaseModel):
    """
    Payload returned by ``POST /api/analyze-delta``.

    Holds two alphabetically sorted lists of content lemmas capturing the
    meaningful lexical differences between the baseline and current texts
    once stopwords are removed and tokens are lemmatised.

    Both lists are **set differences**, not positional diffs:

    - ``removed`` = content lemmas found in A and missing from B.
    - ``added``   = content lemmas found in B and missing from A.
    """

    # Both lists are required fields of the response.
    removed: list[str] = Field(
        default=...,
        description=(
            "Content lemmas present in the baseline (A) "
            "but absent from the current text (B).  "
            "Alphabetically sorted."
        ),
    )
    added: list[str] = Field(
        default=...,
        description=(
            "Content lemmas present in the current text (B) "
            "but absent from the baseline (A).  "
            "Alphabetically sorted."
        ),
    )