You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
198 lines
5.8 KiB
198 lines
5.8 KiB
#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
|
|
#define MUPDF_FITZ_STRUCTURED_TEXT_H
|
|
|
|
#include "mupdf/fitz/system.h"
|
|
#include "mupdf/fitz/context.h"
|
|
#include "mupdf/fitz/geometry.h"
|
|
#include "mupdf/fitz/font.h"
|
|
#include "mupdf/fitz/colorspace.h"
|
|
#include "mupdf/fitz/image.h"
|
|
#include "mupdf/fitz/output.h"
|
|
#include "mupdf/fitz/device.h"
|
|
|
|
/*
	Text extraction device: Used for searching, format conversion etc.

	(In development - Subject to change in future versions)
*/

/*
	Forward declarations, so that the structures below can refer to
	each other through pointers before their full definitions appear.
*/
typedef struct fz_stext_char_s fz_stext_char;
typedef struct fz_stext_line_s fz_stext_line;
typedef struct fz_stext_block_s fz_stext_block;
typedef struct fz_stext_page_s fz_stext_page;
|
|
|
|
/*
	Option flags for structured text extraction. Each flag is a
	distinct bit, so flags can be combined with bitwise OR.
	NOTE(review): presumably stored in fz_stext_options.flags - confirm.

	FZ_STEXT_PRESERVE_LIGATURES: If this option is activated ligatures
	are passed through to the application in their original form. If
	this option is deactivated ligatures are expanded into their
	constituent parts, e.g. the ligature ffi is expanded into three
	separate characters f, f and i.

	FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated whitespace
	is passed through to the application in its original form. If this
	option is deactivated any type of horizontal whitespace (including
	horizontal tabs) will be replaced with space characters of variable
	width.

	FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images will
	be stored in the structured text structure. The default is to ignore
	all images.
*/
enum
{
	FZ_STEXT_PRESERVE_LIGATURES = 1,
	FZ_STEXT_PRESERVE_WHITESPACE = 2,
	FZ_STEXT_PRESERVE_IMAGES = 4 /* no trailing comma: keeps the header valid C89 */
};
|
|
|
|
/*
|
|
A text page is a list of blocks, together with an overall bounding box.
|
|
*/
|
|
struct fz_stext_page_s
|
|
{
|
|
fz_pool *pool;
|
|
fz_rect mediabox;
|
|
fz_stext_block *first_block, *last_block;
|
|
};
|
|
|
|
/*
	Kinds of block found on a text page.
	NOTE(review): presumably the values held in fz_stext_block.type - confirm.
*/
enum
{
	FZ_STEXT_BLOCK_TEXT = 0,
	FZ_STEXT_BLOCK_IMAGE = 1
};
|
|
|
|
/*
|
|
A text block is a list of lines of text (typically a paragraph), or an image.
|
|
*/
|
|
struct fz_stext_block_s
|
|
{
|
|
int type;
|
|
fz_rect bbox;
|
|
union {
|
|
struct { fz_stext_line *first_line, *last_line; } t;
|
|
struct { fz_matrix transform; fz_image *image; } i;
|
|
} u;
|
|
fz_stext_block *prev, *next;
|
|
};
|
|
|
|
/*
|
|
A text line is a list of characters that share a common baseline.
|
|
*/
|
|
struct fz_stext_line_s
|
|
{
|
|
int wmode; /* 0 for horizontal, 1 for vertical */
|
|
fz_point dir; /* normalized direction of baseline */
|
|
fz_rect bbox;
|
|
fz_stext_char *first_char, *last_char;
|
|
fz_stext_line *prev, *next;
|
|
};
|
|
|
|
/*
|
|
A text char is a unicode character, the style in which is appears, and
|
|
the point at which it is positioned.
|
|
*/
|
|
struct fz_stext_char_s
|
|
{
|
|
int c;
|
|
fz_point origin;
|
|
fz_rect bbox;
|
|
float size;
|
|
fz_font *font;
|
|
fz_stext_char *next;
|
|
};
|
|
|
|
/*
	NOTE(review): presumably a human-readable usage string describing the
	options accepted by fz_parse_stext_options - confirm. Defined elsewhere.
*/
extern const char *fz_stext_options_usage;
|
|
|
|
int fz_stext_char_count(fz_context *ctx, fz_stext_page *page);
|
|
const fz_stext_char *fz_stext_char_at(fz_context *ctx, fz_stext_page *page, int idx);
|
|
|
|
/*
|
|
fz_new_stext_page: Create an empty text page.
|
|
|
|
The text page is filled out by the text device to contain the blocks
|
|
and lines of text on the page.
|
|
|
|
mediabox: optional mediabox information.
|
|
*/
|
|
fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);
|
|
void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
|
|
|
|
/*
|
|
fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format.
|
|
*/
|
|
void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page);
|
|
void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
|
|
void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
|
|
|
|
/*
|
|
fz_print_stext_page_as_xhtml: Output a page to a file in XHTML (semantic) format.
|
|
*/
|
|
void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page);
|
|
void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
|
|
void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
|
|
|
|
/*
|
|
fz_print_stext_page_as_xml: Output a page to a file in XML format.
|
|
*/
|
|
void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page);
|
|
|
|
/*
|
|
fz_print_stext_page_as_text: Output a page to a file in UTF-8 format.
|
|
*/
|
|
void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
|
|
|
|
/*
|
|
fz_search_stext_page: Search for occurrence of 'needle' in text page.
|
|
|
|
Return the number of hits and store hit bboxes in the passed in array.
|
|
|
|
NOTE: This is an experimental interface and subject to change without notice.
|
|
*/
|
|
int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);
|
|
|
|
/*
|
|
fz_highlight_selection: Return a list of rectangles to highlight lines inside the selection points.
|
|
*/
|
|
int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_rect *hit_bbox, int hit_max);
|
|
|
|
/*
|
|
fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection.
|
|
|
|
crlf: If true, write "\r\n" style line endings (otherwise "\n" only).
|
|
*/
|
|
char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
|
|
|
|
/*
	struct fz_stext_options: Options for creating a structured text device.
*/
typedef struct fz_stext_options_s fz_stext_options;

struct fz_stext_options_s
{
	/* NOTE(review): presumably a bitwise OR of the FZ_STEXT_PRESERVE_* flags - confirm. */
	int flags;
};
|
|
|
|
/*
|
|
fz_parse_stext_options: Parse stext device options from a comma separated key-value string.
|
|
*/
|
|
fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
|
|
|
|
/*
|
|
fz_new_stext_device: Create a device to extract the text on a page.
|
|
|
|
Gather the text on a page into blocks and lines.
|
|
|
|
The reading order is taken from the order the text is drawn in the
|
|
source file, so may not be accurate.
|
|
|
|
page: The text page to which content should be added. This will
|
|
usually be a newly created (empty) text page, but it can be one
|
|
containing data already (for example when merging multiple pages,
|
|
or watermarking).
|
|
|
|
options: Options to configure the stext device.
|
|
*/
|
|
fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
|
|
|
|
#endif
|