@@ -65,39 +65,42 @@ def create(
65
65
extra_body : Body | None = None ,
66
66
timeout : float | httpx .Timeout | None | NotGiven = NOT_GIVEN ,
67
67
) -> ParseCreateResponse :
68
- """Parse a file into a structured Markdown representation .
68
+ """Parse a file into structured Markdown and/or JSON.
69
69
70
- The file size must be
71
- less than 100MB and the number of pages must be less than 400.
70
+ Files must be less than
71
+ 100MB and 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
72
+ PDF, which may affect page count.
72
73
74
+ See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
75
+ [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
73
76
Email [parse-feedback@contextual.ai](mailto:parse-feedback@contextual.ai) with
74
77
any feedback or questions.
75
78
76
79
Args:
77
80
raw_file: The file to be parsed. The file type must be PDF, DOC / DOCX, PPT / PPTX.
78
81
79
- enable_document_hierarchy: Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
80
- table of contents to the output with the structure of the entire parsed
81
- document . Not permitted in ' basic' parsing_mode, or if page_range is not
82
- continuous and/or does not start from page zero.
82
+ enable_document_hierarchy: Adds a table of contents to the output with the structure of the entire parsed
83
+ document. This feature is in beta. Controls parsing heading levels (e.g. H1, H2,
84
+ H3) at higher quality. Not permitted in `basic` parse_mode, or if page_range
85
+ is not continuous and/or does not start from page zero.
83
86
84
87
enable_split_tables: Controls whether tables are split into multiple tables by row with the headers
85
88
propagated. Use for improving LLM comprehension of very large tables. Not
86
- permitted in ' basic' parsing_mode.
89
+ permitted in `basic` parse_mode.
87
90
88
- figure_caption_mode: Controls how thorough figure captions are. ' concise' is short and minimizes
89
- chances of hallucinations. ' detailed' is more thorough and can include
90
- commentary. Not permitted in ' basic' parsing_mode.
91
+ figure_caption_mode: Controls how thorough figure captions are. `concise` is short and minimizes
92
+ chances of hallucinations. `detailed` is more thorough and can include
93
+ commentary; this mode is in beta. Not permitted in `basic` parse_mode.
91
94
92
95
max_split_table_cells: Threshold number of table cells beyond which large tables are split if
93
- `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
96
+ `enable_split_tables` is True. Not permitted in `basic` parse_mode.
94
97
95
98
page_range: Optional string representing page range to be parsed. Format: comma-separated
96
- indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
97
- ' 0-2,5,6'
99
+ indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
100
+ `0-2,5,6`).
98
101
99
- parse_mode: The settings to use for parsing. ' basic' is for simple, text-only documents.
100
- ' standard' is for complex documents with images, complex hierarchy, and/or no
102
+ parse_mode: The settings to use for parsing. `basic` is for simple, text-only documents.
103
+ `standard` is for complex documents with images, complex hierarchy, and/or no
101
104
natively encoded textual data (e.g. for scanned documents).
102
105
103
106
extra_headers: Send extra headers
@@ -156,11 +159,11 @@ def job_results(
156
159
job_id: Unique ID of the parse job
157
160
158
161
output_types: The desired output format(s) of the parsed file. Must be `markdown-document`,
159
- `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
160
- whole document into a single concatenated markdown output. `markdown-per-page`
161
- provides markdown output per page. `blocks-per-page` provides a structured JSON
162
+ `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
163
+ multiple formats in the response. `markdown-document` parses the whole document
164
+ into a single concatenated markdown output. `markdown-per-page` provides
165
+ markdown output per page. `blocks-per-page` provides a structured JSON
162
166
representation of the content blocks on each page, sorted by reading order.
163
- Specify multiple values to get multiple formats in the response.
164
167
165
168
extra_headers: Send extra headers
166
169
@@ -298,39 +301,42 @@ async def create(
298
301
extra_body : Body | None = None ,
299
302
timeout : float | httpx .Timeout | None | NotGiven = NOT_GIVEN ,
300
303
) -> ParseCreateResponse :
301
- """Parse a file into a structured Markdown representation .
304
+ """Parse a file into structured Markdown and/or JSON.
302
305
303
- The file size must be
304
- less than 100MB and the number of pages must be less than 400.
306
+ Files must be less than
307
+ 100MB and 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
308
+ PDF, which may affect page count.
305
309
310
+ See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
311
+ [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
306
312
Email [parse-feedback@contextual.ai](mailto:parse-feedback@contextual.ai) with
307
313
any feedback or questions.
308
314
309
315
Args:
310
316
raw_file: The file to be parsed. The file type must be PDF, DOC / DOCX, PPT / PPTX.
311
317
312
- enable_document_hierarchy: Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
313
- table of contents to the output with the structure of the entire parsed
314
- document . Not permitted in ' basic' parsing_mode, or if page_range is not
315
- continuous and/or does not start from page zero.
318
+ enable_document_hierarchy: Adds a table of contents to the output with the structure of the entire parsed
319
+ document. This feature is in beta. Controls parsing heading levels (e.g. H1, H2,
320
+ H3) at higher quality. Not permitted in `basic` parse_mode, or if page_range
321
+ is not continuous and/or does not start from page zero.
316
322
317
323
enable_split_tables: Controls whether tables are split into multiple tables by row with the headers
318
324
propagated. Use for improving LLM comprehension of very large tables. Not
319
- permitted in ' basic' parsing_mode.
325
+ permitted in `basic` parse_mode.
320
326
321
- figure_caption_mode: Controls how thorough figure captions are. ' concise' is short and minimizes
322
- chances of hallucinations. ' detailed' is more thorough and can include
323
- commentary. Not permitted in ' basic' parsing_mode.
327
+ figure_caption_mode: Controls how thorough figure captions are. `concise` is short and minimizes
328
+ chances of hallucinations. `detailed` is more thorough and can include
329
+ commentary; this mode is in beta. Not permitted in `basic` parse_mode.
324
330
325
331
max_split_table_cells: Threshold number of table cells beyond which large tables are split if
326
- `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
332
+ `enable_split_tables` is True. Not permitted in `basic` parse_mode.
327
333
328
334
page_range: Optional string representing page range to be parsed. Format: comma-separated
329
- indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
330
- ' 0-2,5,6'
335
+ indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
336
+ `0-2,5,6`).
331
337
332
- parse_mode: The settings to use for parsing. ' basic' is for simple, text-only documents.
333
- ' standard' is for complex documents with images, complex hierarchy, and/or no
338
+ parse_mode: The settings to use for parsing. `basic` is for simple, text-only documents.
339
+ `standard` is for complex documents with images, complex hierarchy, and/or no
334
340
natively encoded textual data (e.g. for scanned documents).
335
341
336
342
extra_headers: Send extra headers
@@ -389,11 +395,11 @@ async def job_results(
389
395
job_id: Unique ID of the parse job
390
396
391
397
output_types: The desired output format(s) of the parsed file. Must be `markdown-document`,
392
- `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
393
- whole document into a single concatenated markdown output. `markdown-per-page`
394
- provides markdown output per page. `blocks-per-page` provides a structured JSON
398
+ `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
399
+ multiple formats in the response. `markdown-document` parses the whole document
400
+ into a single concatenated markdown output. `markdown-per-page` provides
401
+ markdown output per page. `blocks-per-page` provides a structured JSON
395
402
representation of the content blocks on each page, sorted by reading order.
396
- Specify multiple values to get multiple formats in the response.
397
403
398
404
extra_headers: Send extra headers
399
405
0 commit comments