from marshmallow import fields, validate
from parseur.schemas import BaseSchema
from parseur.schemas.document import DocumentStatus
from parseur.schemas.paserfield import ParserFieldSchema, TableFieldSchema
from parseur.schemas.webhook import WebhookSchema
class SplitKeyWordsSchema(BaseSchema):
    """Schema for one keyword entry used when splitting documents by keyword.

    Both fields are required.
    """

    # Presumably True when the split happens before the keyword and False
    # when it happens after it -- confirm against the API documentation.
    is_before = fields.Boolean(required=True)
    # The keyword text to split on.
    keyword = fields.String(required=True)
class MailboxSchema(BaseSchema):
    """Schema describing a mailbox and its configuration.

    Fields are grouped into: identity, basic settings, advanced settings
    (file filtering, OCR, sender lists, page and split processing),
    counters, activity timestamps, download URLs, metadata-field toggles,
    webhooks, and parser/table field definitions.
    """

    # Identity
    id = fields.Int(required=True)
    name = fields.String(required=True)
    email_prefix = fields.String(required=True)
    account_uuid = fields.String(required=True)
    ai_engine = fields.String(required=True)
    ai_instructions = fields.String(allow_none=True)

    # ################
    # Basic settings #
    # ################

    # Accepted values are ".", "," and the empty string; None is permitted
    # through allow_none=True.
    # NOTE(review): the error message says "null" although the validator
    # also accepts "" -- confirm which wording is intended.
    decimal_separator = fields.String(
        allow_none=True,
        validate=validate.OneOf([".", ",", ""], error="Must be '.', ',' or null."),
    )
    default_timezone = fields.String(allow_none=True)
    default_language = fields.String(allow_none=True)

    # Input date format for parsing dates. Accepts "MONTH_FIRST", "DAY_FIRST", or None.
    # MONTH_FIRST: mm/dd/yyyy, mm-dd-yyyy
    # DAY_FIRST: dd/mm/yyyy, dd-mm-yyyy
    # Disable auto-detection of input_date_format
    # NOTE(review): the four comment lines above describe an input date format
    # setting and a flag disabling its auto-detection, but no matching field
    # declarations are visible here -- confirm whether they were dropped.

    # Parseur will automatically delete documents once they get older than
    # the selected threshold.
    retention_policy = fields.Int(allow_none=True)

    # ###################
    # Advanced settings #
    # ###################

    # List of allowed file extensions for document processing.
    # Example: ["pdf", "docx", "png"]
    allowed_extensions = fields.List(fields.String(), allow_none=True)

    # Force use of OCR on PDFs. Enable if data is garbled or text is in images.
    # Reprocess documents after enabling. May slow down processing.
    force_ocr = fields.Boolean(allow_none=True)

    # Expand field names in JSON Result.
    # Example: "user.name": "John" -> {"user": {"name": "John"}}.
    expand_result = fields.Boolean(allow_none=True)

    # Disable links on documents. Useful for manual data entry.
    disable_document_links = fields.Boolean(allow_none=True)

    # Disable the deskew algorithm if it creates a staircase effect when straightening.
    disable_deskew = fields.Boolean(allow_none=True)

    # Extract XML from HTML comments into separate documents.
    # NOTE(review): no field declaration matching the comment above is
    # visible here -- confirm whether it was dropped.

    # Email sender block/allow list.
    # True = allowlist mode (only allow listed senders).
    # False = blocklist mode (block listed senders).
    use_whitelist_instead_of_blacklist = fields.Boolean(allow_none=True)
    emails_or_domains = fields.List(fields.String(), allow_none=True)

    # Email processing: process emails and attachments.
    process_attachments = fields.Boolean(required=True)
    # Email processing: process attachments only. Skip emails.
    attachments_only = fields.Boolean(required=True)

    # Page processing: only even pages (2, 4, 6, ...)
    even_pages = fields.Boolean(required=True)
    # Page processing: only odd pages (1, 3, 5, ...)
    odd_pages = fields.Boolean(required=True)
    # Page processing: only these page ranges (same format as split_page_range_set).
    # NOTE(review): PageRangeSchema is not imported or defined in the visible
    # part of this module -- confirm it is in scope.
    page_range_set = fields.Nested(PageRangeSchema, allow_none=True, many=True)

    # Split documents every N pages.
    split_page = fields.Int(allow_none=True)

    # Split documents by page ranges.
    # Example input: 1-5, 8, 11-13
    # Enter ranges separated by commas. Use brackets to count from the end.
    # E.g., (1) is last page. Example: 1, 2-(1) splits into two docs:
    # - first page only
    # - from page 2 to the end.
    split_page_range_set = fields.Nested(PageRangeSchema, allow_none=True, many=True)

    # Split documents by keywords.
    # Enter the list of keywords to split on.
    # Supports splitting before or after keywords.
    # Keywords are case-sensitive.
    split_keywords = fields.Nested(SplitKeyWordsSchema, allow_none=True, many=True)

    # Counters
    document_count = fields.Int(allow_none=True)
    webhook_count = fields.Int(allow_none=True)
    template_count = fields.Int(allow_none=True)
    parser_object_count = fields.Int(allow_none=True)

    # Document count per status: keys must be DocumentStatus values,
    # values are integers.
    document_per_status_count = fields.Dict(
        keys=fields.String(validate=validate.OneOf([e.value for e in DocumentStatus])),
        values=fields.Int(),
        required=True,
    )

    # Last activity and modification timestamps
    last_activity = fields.DateTime(allow_none=True)
    template_set_last_modified = fields.DateTime(allow_none=True)
    parser_object_set_last_modified = fields.DateTime(allow_none=True)

    # URLs
    csv_download = fields.String(allow_none=True)
    json_download = fields.String(allow_none=True)
    xls_download = fields.String(allow_none=True)

    # Metadata fields (one required boolean per metadata field)
    attachments_field = fields.Boolean(required=True)
    bcc_field = fields.Boolean(required=True)
    cc_field = fields.Boolean(required=True)
    content_field = fields.Boolean(required=True)
    credit_count_field = fields.Boolean(required=True)
    document_id_field = fields.Boolean(required=True)
    document_url_field = fields.Boolean(required=True)
    html_document_field = fields.Boolean(required=True)
    last_reply_field = fields.Boolean(required=True)
    mailbox_id_field = fields.Boolean(required=True)
    original_document_field = fields.Boolean(required=True)
    original_recipient_field = fields.Boolean(required=True)
    page_count_field = fields.Boolean(required=True)
    parsing_engine_field = fields.Boolean(required=True)
    processed_date_field = fields.Boolean(required=True)
    processed_field = fields.Boolean(required=True)
    processed_time_field = fields.Boolean(required=True)
    public_document_url_field = fields.Boolean(required=True)
    received_date_field = fields.Boolean(required=True)
    received_field = fields.Boolean(required=True)
    received_time_field = fields.Boolean(required=True)
    recipient_field = fields.Boolean(required=True)
    recipient_suffix_field = fields.Boolean(required=True)
    reply_to_field = fields.Boolean(required=True)
    searchable_pdf_field = fields.Boolean(required=True)
    sender_field = fields.Boolean(required=True)
    sender_name_field = fields.Boolean(required=True)
    split_page_range_field = fields.Boolean(required=True)
    split_parent_id_field = fields.Boolean(required=True)
    subject_field = fields.Boolean(required=True)
    template_field = fields.Boolean(required=True)
    text_document_field = fields.Boolean(required=True)
    to_field = fields.Boolean(required=True)

    # Webhooks
    available_webhook_set = fields.List(fields.Nested(WebhookSchema), required=True)
    webhook_set = fields.List(fields.Nested(WebhookSchema), required=True)

    # Parser and table fields
    table_set = fields.List(fields.Nested(TableFieldSchema))
    parser_object_set = fields.List(fields.Nested(ParserFieldSchema))