Source code for parseur.schemas.mailbox

from marshmallow import fields, validate

from parseur.schemas import BaseSchema
from parseur.schemas.document import DocumentStatus
from parseur.schemas.paserfield import ParserFieldSchema, TableFieldSchema
from parseur.schemas.webhook import WebhookSchema


class PageRangeSchema(BaseSchema):
    """Schema for one page range, given as a start and an optional end index."""

    # First index of the range; must always be supplied.
    start_index = fields.Integer(required=True)
    # Last index of the range; may be None.
    # NOTE(review): presumably None means an open-ended range — confirm.
    end_index = fields.Integer(allow_none=True)
class SplitKeyWordsSchema(BaseSchema):
    """Schema for one keyword-based document split rule."""

    # Mandatory flag paired with the keyword.
    # NOTE(review): presumably True splits before the keyword and False
    # splits after it — confirm against the API.
    is_before = fields.Boolean(required=True)
    # Mandatory keyword text to split on.
    keyword = fields.String(required=True)
class MailboxSchema(BaseSchema):
    """Schema for a mailbox payload.

    Fields are grouped, in declaration order, into: identity, basic
    settings, advanced settings, email/page processing, splitting,
    counters, timestamps, download URLs, metadata-field toggles,
    webhooks, and parser/table field definitions.
    """

    # ------------------------------------------------------------------
    # Identity
    # ------------------------------------------------------------------
    id = fields.Integer(required=True)
    name = fields.String(required=True)
    email_prefix = fields.String(required=True)
    account_uuid = fields.String(required=True)
    ai_engine = fields.String(required=True)
    ai_instructions = fields.String(allow_none=True)

    # ------------------------------------------------------------------
    # Basic settings
    # ------------------------------------------------------------------
    # NOTE(review): the empty string is an accepted value here although the
    # error message only mentions '.', ',' and null — confirm this is intended.
    decimal_separator = fields.String(
        allow_none=True,
        validate=validate.OneOf(
            [".", ",", ""],
            error="Must be '.', ',' or null.",
        ),
    )
    default_timezone = fields.String(allow_none=True)
    default_language = fields.String(allow_none=True)

    # Input date format used when parsing dates. Accepts "MONTH_FIRST",
    # "DAY_FIRST", or None.
    #   MONTH_FIRST: mm/dd/yyyy, mm-dd-yyyy
    #   DAY_FIRST:   dd/mm/yyyy, dd-mm-yyyy
    input_date_format = fields.String(
        allow_none=True,
        validate=validate.OneOf(
            ["MONTH_FIRST", "DAY_FIRST"],
            error="Must be 'MONTH_FIRST', 'DAY_FIRST', or null.",
        ),
    )

    # Controls auto-detection of input_date_format.
    # NOTE(review): the previous comment read "Disable auto-detection of
    # input_date_format", which seems at odds with the field name — confirm
    # the polarity of this flag.
    input_date_format_autodetection = fields.Boolean()

    # Parseur automatically deletes documents once they get older than the
    # selected threshold.
    # NOTE(review): the unit of the threshold is not visible here — confirm.
    retention_policy = fields.Integer(allow_none=True)

    # ------------------------------------------------------------------
    # Advanced settings
    # ------------------------------------------------------------------
    # File extensions allowed for document processing.
    # Example: ["pdf", "docx", "png"]
    allowed_extensions = fields.List(fields.String(), allow_none=True)

    # Force the use of OCR on PDFs. Enable when data is garbled or the text
    # is in images. Reprocess documents after enabling. May slow down
    # processing.
    force_ocr = fields.Boolean(allow_none=True)

    # Expand field names in the JSON result.
    # Example: "user.name": "John" -> {"user": {"name": "John"}}.
    expand_result = fields.Boolean(allow_none=True)

    # NOTE(review): the comment block here previously also said "Disable
    # links on documents. Useful for manual data entry." but no matching
    # field is declared — confirm whether a field is missing.
    # Disable the deskew algorithm if it creates a staircase effect when
    # straightening.
    disable_deskew = fields.Boolean(allow_none=True)

    # Extract XML from HTML comments into separate documents.
    extract_xml_from_comment = fields.Boolean(allow_none=True)

    # Email sender block/allow list.
    #   True  = allowlist mode (only allow listed senders).
    #   False = blocklist mode (block listed senders).
    use_whitelist_instead_of_blacklist = fields.Boolean(allow_none=True)
    emails_or_domains = fields.List(fields.String(), allow_none=True)

    # ------------------------------------------------------------------
    # Email and page processing
    # ------------------------------------------------------------------
    # Email processing: process emails and attachments.
    process_attachments = fields.Boolean(required=True)
    # Email processing: process attachments only; skip emails.
    attachments_only = fields.Boolean(required=True)
    # Page processing: only even pages (2, 4, 6, ...).
    even_pages = fields.Boolean(required=True)
    # Page processing: only odd pages (1, 3, 5, ...).
    odd_pages = fields.Boolean(required=True)
    # Page processing: only these page ranges (same format as
    # split_page_range_set).
    page_range_set = fields.Nested(PageRangeSchema, many=True, allow_none=True)

    # ------------------------------------------------------------------
    # Document splitting
    # ------------------------------------------------------------------
    # Split documents every N pages.
    split_page = fields.Integer(allow_none=True)

    # Split documents by page ranges.
    # Example input: 1-5, 8, 11-13
    # Enter ranges separated by commas. Use brackets to count from the end,
    # e.g. (1) is the last page. Example: 1, 2-(1) splits into two docs:
    #   - first page only
    #   - from page 2 to the end.
    split_page_range_set = fields.Nested(
        PageRangeSchema, many=True, allow_none=True
    )

    # Split documents by keywords: the list of keywords to split on.
    # Supports splitting before or after keywords. Keywords are
    # case-sensitive.
    split_keywords = fields.Nested(
        SplitKeyWordsSchema, many=True, allow_none=True
    )

    # ------------------------------------------------------------------
    # Counters
    # ------------------------------------------------------------------
    document_count = fields.Integer(allow_none=True)
    webhook_count = fields.Integer(allow_none=True)
    template_count = fields.Integer(allow_none=True)
    parser_object_count = fields.Integer(allow_none=True)

    # Document count per status; keys are restricted to DocumentStatus values.
    document_per_status_count = fields.Dict(
        keys=fields.String(
            validate=validate.OneOf(
                [status.value for status in DocumentStatus]
            )
        ),
        values=fields.Integer(),
        required=True,
    )

    # ------------------------------------------------------------------
    # Last activity and modification timestamps
    # ------------------------------------------------------------------
    last_activity = fields.DateTime(allow_none=True)
    template_set_last_modified = fields.DateTime(allow_none=True)
    parser_object_set_last_modified = fields.DateTime(allow_none=True)

    # ------------------------------------------------------------------
    # Download URLs
    # ------------------------------------------------------------------
    csv_download = fields.String(allow_none=True)
    json_download = fields.String(allow_none=True)
    xls_download = fields.String(allow_none=True)

    # ------------------------------------------------------------------
    # Metadata fields (all mandatory boolean flags)
    # ------------------------------------------------------------------
    attachments_field = fields.Boolean(required=True)
    bcc_field = fields.Boolean(required=True)
    cc_field = fields.Boolean(required=True)
    content_field = fields.Boolean(required=True)
    credit_count_field = fields.Boolean(required=True)
    document_id_field = fields.Boolean(required=True)
    document_url_field = fields.Boolean(required=True)
    headers_field = fields.Boolean(required=True)
    html_document_field = fields.Boolean(required=True)
    last_reply_field = fields.Boolean(required=True)
    mailbox_id_field = fields.Boolean(required=True)
    original_document_field = fields.Boolean(required=True)
    original_recipient_field = fields.Boolean(required=True)
    page_count_field = fields.Boolean(required=True)
    parsing_engine_field = fields.Boolean(required=True)
    processed_date_field = fields.Boolean(required=True)
    processed_field = fields.Boolean(required=True)
    processed_time_field = fields.Boolean(required=True)
    public_document_url_field = fields.Boolean(required=True)
    received_date_field = fields.Boolean(required=True)
    received_field = fields.Boolean(required=True)
    received_time_field = fields.Boolean(required=True)
    recipient_field = fields.Boolean(required=True)
    recipient_suffix_field = fields.Boolean(required=True)
    reply_to_field = fields.Boolean(required=True)
    searchable_pdf_field = fields.Boolean(required=True)
    sender_field = fields.Boolean(required=True)
    sender_name_field = fields.Boolean(required=True)
    split_page_range_field = fields.Boolean(required=True)
    split_parent_id_field = fields.Boolean(required=True)
    subject_field = fields.Boolean(required=True)
    template_field = fields.Boolean(required=True)
    text_document_field = fields.Boolean(required=True)
    to_field = fields.Boolean(required=True)

    # ------------------------------------------------------------------
    # Webhooks
    # ------------------------------------------------------------------
    available_webhook_set = fields.List(
        fields.Nested(WebhookSchema), required=True
    )
    webhook_set = fields.List(fields.Nested(WebhookSchema), required=True)

    # ------------------------------------------------------------------
    # Parser and table fields
    # ------------------------------------------------------------------
    table_set = fields.List(fields.Nested(TableFieldSchema))
    parser_object_set = fields.List(fields.Nested(ParserFieldSchema))