# Source code for parseur.schemas.mailbox

from enum import Enum, IntFlag, auto

from marshmallow import RAISE, fields, validate

from parseur.schemas import BaseSchema


from parseur.schemas.document import DocumentStatus
from parseur.schemas.paserfield import (
    ParserFieldReadSchema,
    ParserFieldWriteSchema,
    TableFieldReadSchema,
)
from parseur.schemas.webhook import WebhookSchema

# File extensions accepted for document processing. MailboxWriteSchema
# validates each ``allowed_extensions`` entry against this set. A frozenset so
# the shared module-level constant cannot be mutated at runtime.
SUPPORTED_FILE_EXTENSIONS = frozenset(
    {
        "bmp",
        "csv",
        "doc",
        "docx",
        "eml",
        "gif",
        "html",
        "ics",
        "jpg",
        "mbox",
        "msg",
        "odp",
        "ods",
        "odt",
        "pdf",
        "png",
        "ppt",
        "pptx",
        "rtf",
        "tif",
        "txt",
        "xhtml",
        "xls",
        "xlsm",
        "xlsx",
        "xml",
        "zip",
    }
)


class Metadata(IntFlag):
    """Per-document metadata columns a mailbox can expose.

    A :class:`~enum.IntFlag`, so columns compose with ``|`` and can be enabled or
    disabled together (see :meth:`parseur.Mailbox.set_metadata`). Single source of
    truth for the ``*_field`` toggles: a member maps to its API field by
    lowercasing its name and appending ``_field`` (see :attr:`field`) — e.g.
    ``Metadata.SUBJECT`` -> ``"subject_field"``. Both the read and write mailbox
    schemas are derived from this enum so the column list lives in one place.

    Member order is significant: values are assigned by ``auto()``, so
    inserting or reordering members changes their numeric values.
    """

    ATTACHMENTS = auto()
    BCC = auto()
    CC = auto()
    CONTENT = auto()
    CREATED_DATE = auto()
    CREATED = auto()
    CREATED_TIME = auto()
    CREDIT_COUNT = auto()
    DOCUMENT_ID = auto()
    DOCUMENT_URL = auto()
    HEADERS = auto()
    HTML_DOCUMENT = auto()
    LAST_REPLY = auto()
    MAILBOX_ID = auto()
    ORIGINAL_DOCUMENT = auto()
    ORIGINAL_RECIPIENT = auto()
    PAGE_COUNT = auto()
    PARSING_ENGINE = auto()
    PROCESSED_DATE = auto()
    PROCESSED = auto()
    PROCESSED_TIME = auto()
    PUBLIC_DOCUMENT_URL = auto()
    RECEIVED_DATE = auto()
    RECEIVED = auto()
    RECEIVED_TIME = auto()
    RECIPIENT = auto()
    RECIPIENT_SUFFIX = auto()
    REPLY_TO = auto()
    SEARCHABLE_PDF = auto()
    SENDER = auto()
    SENDER_NAME = auto()
    SPLIT_PAGE_RANGE = auto()
    SPLIT_PARENT_ID = auto()
    SUBJECT = auto()
    TEMPLATE = auto()
    TEXT_DOCUMENT = auto()
    TO = auto()

    @property
    def field(self) -> str:
        """The mailbox schema field this column maps to (e.g. ``subject_field``).

        Only defined for a single column.

        Raises:
            ValueError: If called on an empty or combined flag value (e.g.
                ``Metadata.SUBJECT | Metadata.TO``), which has no single API
                field. Iterate the individual members instead.
        """
        name = self.name
        # An empty/combined flag has no usable name: depending on the Python
        # version ``name`` is ``None`` or a ``"A|B"`` pseudo-name. Fail loudly
        # rather than crash on ``None`` or return a field that does not exist.
        if name is None or "|" in name:
            raise ValueError(
                f"{self!r} is not a single metadata column and has no field name."
            )
        return f"{name.lower()}_field"


# Read/write field mixins for the metadata columns, generated from Metadata so
# the column list is never repeated. Read keeps them lenient (server output,
# ``None`` allowed); write exposes them as optional boolean toggles.
MetadataReadFields = BaseSchema.from_dict(
    {column.field: fields.Boolean(allow_none=True) for column in Metadata},
    name="MetadataReadFields",
)
MetadataWriteFields = BaseSchema.from_dict(
    {column.field: fields.Boolean() for column in Metadata},
    name="MetadataWriteFields",
)


class AIEngine(str, Enum):
    """
    Enumeration of AI engines that can be set on a mailbox when creating or
    updating it.

    The values mirror the ``parser.ai_engine`` choices returned by the
    ``/bootstrap`` endpoint.

    Members:

    - ``DISABLED``: No AI engine (template-based parsing only).
    - ``GCP_AI_2_5``: AI Text engine v2.5 (analyzes extracted text).
    - ``GCP_AI_3_TXT``: AI Text engine v3 (analyzes extracted text).
    - ``GCP_AI_2``: AI Vision engine v3 (understands layout and images).
    """

    # NOTE(review): GCP_AI_2 is described above as "v3" despite the "_2" in its
    # name — confirm the label against the /bootstrap choices.
    DISABLED = "DISABLED"
    GCP_AI_2_5 = "GCP_AI_2_5"
    GCP_AI_3_TXT = "GCP_AI_3_TXT"
    GCP_AI_2 = "GCP_AI_2"


class IdentificationStatus(str, Enum):
    """Identification status accepted when creating/updating a mailbox.

    A ``str`` enum, so members compare equal to their plain string values.
    """

    REQUESTED = "REQUESTED"
    PROGRESS = "PROGRESS"
    COMPLETED = "COMPLETED"
    MANUAL = "MANUAL"


class DateFormat(str, Enum):
    """How to read ambiguous dates in documents (the ``input_date_format``).

    A ``str`` enum, so members compare equal to their plain string values.
    """

    MONTH_FIRST = "MONTH_FIRST"  # mm/dd/yyyy, mm-dd-yyyy, ...
    DAY_FIRST = "DAY_FIRST"  # dd/mm/yyyy, dd-mm-yyyy, ...


class DecimalSeparator(str, Enum):
    """Decimal separator for numbers in documents (the ``decimal_separator``).

    A ``str`` enum, so members compare equal to their plain string values.
    """

    DOT = "."  # 123.45
    COMMA = ","  # 123,45


class PageRangeSchema(BaseSchema):
    """A page range: a required ``start_index`` and an optional ``end_index``."""

    start_index = fields.Int(required=True)
    # May be null. NOTE(review): presumably null means an open-ended range —
    # confirm against the API.
    end_index = fields.Int(allow_none=True)


class SplitKeyWordsSchema(BaseSchema):
    """A keyword to split documents on; both fields are required."""

    # NOTE(review): presumably True splits before the keyword and False after
    # it — confirm against the API.
    is_before = fields.Boolean(required=True)
    keyword = fields.String(required=True)


class MailboxBaseSchema(BaseSchema):
    """
    Mailbox settings shared by the read and write representations.

    These fields have the same definition whether a mailbox is read from the
    API or sent to it, so they are declared once here and inherited by both
    :class:`MailboxReadSchema` and :class:`MailboxWriteSchema`.
    """

    ai_instructions = fields.String(allow_none=True)

    # Decimal separator override, restricted to the DecimalSeparator values.
    decimal_separator = fields.String(
        allow_none=True,
        # "" kept for read leniency (a mailbox may report no override).
        validate=validate.OneOf(
            [e.value for e in DecimalSeparator] + [""],
            error="Must be '.', ',' or null.",
        ),
    )
    default_timezone = fields.String(allow_none=True)

    default_language = fields.String(allow_none=True)

    # Input date format for parsing dates. Accepts "MONTH_FIRST", "DAY_FIRST", or None.
    #   MONTH_FIRST: mm/dd/yyyy, mm-dd-yyyy
    #   DAY_FIRST: dd/mm/yyyy, dd-mm-yyyy
    input_date_format = fields.String(
        allow_none=True,
        validate=validate.OneOf(
            [e.value for e in DateFormat],
            error="Must be 'MONTH_FIRST', 'DAY_FIRST', or null.",
        ),
    )
    # Parseur will automatically delete documents once they get older than the selected threshold.
    retention_policy = fields.Int(allow_none=True)

    # List of allowed file extensions for document processing.
    #   Example: ["pdf", "docx", "png"]
    # Unvalidated here; MailboxWriteSchema redeclares it with validation.
    allowed_extensions = fields.List(fields.String(), allow_none=True)

    # Email sender block/allow list.
    #   True = allowlist mode (only allow listed senders).
    #   False = blocklist mode (block listed senders).
    use_whitelist_instead_of_blacklist = fields.Boolean(allow_none=True)
    emails_or_domains = fields.List(fields.String(), allow_none=True)

    # Page processing: only these page ranges (same shape as split_page_range_set).
    page_range_set = fields.Nested(PageRangeSchema, allow_none=True, many=True)

    # Split documents every N pages.
    split_page = fields.Int(allow_none=True)
    # Split documents by page ranges.
    #   Example input: 1-5, 8, 11-13
    #   Enter ranges separated by commas. Use brackets to count from the end.
    #   E.g., (1) is last page. Example: 1, 2-(1) splits into two docs:
    #   - first page only
    #   - from page 2 to the end.
    split_page_range_set = fields.Nested(PageRangeSchema, allow_none=True, many=True)
    # Split documents by keywords.
    #   Enter the list of keywords to split on.
    #   Supports splitting before or after keywords.
    #   Keywords are case-sensitive.
    split_keywords = fields.Nested(SplitKeyWordsSchema, allow_none=True, many=True)


class MailboxReadSchema(MailboxBaseSchema, MetadataReadFields):
    """Schema for a mailbox as returned by the API.

    Combines the shared settings from :class:`MailboxBaseSchema` with the
    generated ``*_field`` metadata toggles from ``MetadataReadFields`` and adds
    the read-only attributes declared below.
    """

    # Identity
    id = fields.Int(required=True)
    name = fields.String(required=True)
    email_prefix = fields.String(required=True)
    account_uuid = fields.String(required=True)

    # Not restricted to AIEngine values on read, unlike the write schema.
    ai_engine = fields.String(required=True)
    # AI document splitting
    is_ai_split_enabled = fields.Boolean(allow_none=True)
    ai_split_instructions = fields.String(allow_none=True)

    # Email processing: process emails and attachments.
    process_attachments = fields.Boolean(required=True)
    # Email processing: process attachments only. Skip emails.
    attachments_only = fields.Boolean(required=True)

    # Page processing: only even pages (2, 4, 6, ...) / odd pages (1, 3, 5, ...)
    even_pages = fields.Boolean(required=True)
    odd_pages = fields.Boolean(required=True)

    # Counters
    document_count = fields.Int(allow_none=True)
    webhook_count = fields.Int(allow_none=True)
    template_count = fields.Int(allow_none=True)
    parser_object_count = fields.Int(allow_none=True)
    # Document count per status; keys must be DocumentStatus values.
    document_per_status_count = fields.Dict(
        keys=fields.String(validate=validate.OneOf([e.value for e in DocumentStatus])),
        values=fields.Int(),
        required=True,
    )

    # Last activity and modification timestamps
    last_activity = fields.DateTime(allow_none=True)
    template_set_last_modified = fields.DateTime(allow_none=True)
    parser_object_set_last_modified = fields.DateTime(allow_none=True)

    # URLs
    csv_download = fields.String(allow_none=True)
    json_download = fields.String(allow_none=True)
    xls_download = fields.String(allow_none=True)

    # Webhooks
    available_webhook_set = fields.List(fields.Nested(WebhookSchema), required=True)
    webhook_set = fields.List(fields.Nested(WebhookSchema), required=True)

    # Parser and tables fields
    table_set = fields.List(fields.Nested(TableFieldReadSchema))
    parser_object_set = fields.List(fields.Nested(ParserFieldReadSchema))


class MailboxWriteSchema(MailboxBaseSchema, MetadataWriteFields):
    """
    Schema describing the writable fields of a mailbox.

    Used to validate and serialize the request body sent when creating or
    updating a mailbox. Every field is optional, but unknown fields are
    rejected so that typos and read-only fields never reach the API silently.
    """

    class Meta:
        # Reject unknown fields instead of silently dropping them, so a
        # mistyped or read-only field raises a ValidationError.
        # NOTE(review): this Meta does not subclass the parent schema's Meta,
        # so any options set on BaseSchema.Meta may not carry over — confirm
        # that is intended.
        unknown = RAISE
        ordered = True

    name = fields.String()
    # Restricted to the AIEngine values on write.
    ai_engine = fields.String(
        validate=validate.OneOf(
            [e.value for e in AIEngine],
            error="Must be one of: " + ", ".join(e.value for e in AIEngine) + ".",
        )
    )
    identification_status = fields.String(
        validate=validate.OneOf([e.value for e in IdentificationStatus])
    )

    process_attachments = fields.Boolean()
    attachments_only = fields.Boolean()
    even_pages = fields.Boolean()
    odd_pages = fields.Boolean()

    # AI document splitting (the page-range / N-page / keyword split fields are
    # inherited as writable from MailboxBaseSchema).
    is_ai_split_enabled = fields.Boolean(allow_none=True)
    ai_split_instructions = fields.String(allow_none=True)

    # "Files to process": validated against the supported extensions on write.
    # Overrides the unvalidated declaration inherited from MailboxBaseSchema.
    allowed_extensions = fields.List(
        fields.String(validate=validate.OneOf(sorted(SUPPORTED_FILE_EXTENSIONS))),
        allow_none=True,
    )

    # Field definitions, validated against the writable field schema.
    parser_object_set = fields.Nested(
        ParserFieldWriteSchema, many=True, allow_none=True
    )


class MailboxCreateSchema(MailboxWriteSchema):
    """Validate/serialize the body of a ``POST /parser`` (create) request.

    Adds nothing to :class:`MailboxWriteSchema`; it exists as a distinct name
    for the create request.
    """


class MailboxUpdateSchema(MailboxWriteSchema):
    """Validate/serialize the body of a ``PUT /parser/{id}`` (update) request.

    Same writable fields as :class:`MailboxWriteSchema`, plus a mandatory
    ``id``.
    """

    # Required on update, unlike every inherited writable field.
    id = fields.Int(required=True)