You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

471 lines
12 KiB

// Package fitz provides a wrapper for the [MuPDF](http://mupdf.com/) fitz library
// that can extract pages from PDF and EPUB documents as images, text, HTML or SVG.
package fitz
/*
#include <mupdf/fitz.h>
#include <stdlib.h>
const char *fz_version = FZ_VERSION;
*/
import "C"
import (
"errors"
"image"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
"unsafe"
)
// Sentinel errors returned by this package. They are returned unwrapped, so
// callers can compare against them directly.
var (
// ErrNoSuchFile is returned by New when os.Stat fails for the resolved path.
ErrNoSuchFile = errors.New("fitz: no such file")
// ErrCreateContext is returned when fz_new_context_imp yields a nil context.
ErrCreateContext = errors.New("fitz: cannot create context")
// ErrOpenDocument is returned when MuPDF yields a nil document handle.
ErrOpenDocument = errors.New("fitz: cannot open document")
// ErrOpenMemory is returned by NewFromMemory when fz_open_memory yields a nil stream.
ErrOpenMemory = errors.New("fitz: cannot open memory")
// ErrPageMissing is returned when the requested page number is not below NumPage.
ErrPageMissing = errors.New("fitz: page missing")
// ErrCreatePixmap is returned when the pixmap for rendering cannot be allocated.
ErrCreatePixmap = errors.New("fitz: cannot create pixmap")
// ErrPixmapSamples is returned when fz_pixmap_samples yields a nil pointer.
ErrPixmapSamples = errors.New("fitz: cannot get pixmap samples")
// ErrNeedsPassword is returned when fz_needs_password reports a non-zero value.
ErrNeedsPassword = errors.New("fitz: document needs password")
// ErrLoadOutline is returned by ToC when fz_load_outline yields a nil outline.
ErrLoadOutline = errors.New("fitz: cannot load outline")
)
// Document represents fitz document.
//
// A Document is created by New, NewFromMemory or NewFromReader and must be
// released with Close. It contains a sync.Mutex and therefore must not be
// copied after first use; always pass it around as *Document.
type Document struct {
// ctx is the MuPDF context created with fz_new_context_imp.
ctx *C.struct_fz_context_s
// doc is the MuPDF document handle opened within ctx.
doc *C.struct_fz_document_s
// mtx is held by ImageDPI, ImagePNG, Text, HTML and SVG while they use ctx and doc.
mtx sync.Mutex
}
// Outline is a single entry of the document's table of contents as returned
// by ToC. Nested entries are flattened into one slice; Level records the
// nesting depth of each entry.
type Outline struct {
// Hierarchy level of the entry (starting from 1).
Level int
// Title of outline item.
Title string
// Destination in the document to be displayed when this outline item is activated.
URI string
// The page number of an internal link.
Page int
// Top is copied from the y field of the underlying fz_outline entry.
Top float64
}
// New opens the document stored at filename and returns a handle to it.
//
// The path is made absolute before it is handed to MuPDF. The returned
// Document must be released with Close once it is no longer needed.
//
// Errors:
//   - the error from filepath.Abs if the path cannot be made absolute,
//   - ErrNoSuchFile if os.Stat fails for the path,
//   - ErrCreateContext if the MuPDF context cannot be created,
//   - ErrOpenDocument if MuPDF returns no document handle,
//   - ErrNeedsPassword if the document reports that it needs a password
//     (the document stays open in that case and should still be closed).
func New(filename string) (f *Document, err error) {
	f = &Document{}

	filename, err = filepath.Abs(filename)
	if err != nil {
		return f, err
	}

	if _, e := os.Stat(filename); e != nil {
		return f, ErrNoSuchFile
	}

	f.ctx = (*C.struct_fz_context_s)(unsafe.Pointer(C.fz_new_context_imp(nil, nil, C.FZ_STORE_UNLIMITED, C.fz_version)))
	if f.ctx == nil {
		return f, ErrCreateContext
	}

	C.fz_register_document_handlers(f.ctx)

	cfilename := C.CString(filename)
	defer C.free(unsafe.Pointer(cfilename))

	f.doc = C.fz_open_document(f.ctx, cfilename)
	if f.doc == nil {
		// Stop here: querying a nil document for its password state is
		// invalid, and it could also overwrite ErrOpenDocument. Release the
		// context so a failed open does not leak it, and clear the field so
		// a later Close does not drop it a second time.
		C.fz_drop_context(f.ctx)
		f.ctx = nil
		return f, ErrOpenDocument
	}

	if int(C.fz_needs_password(f.ctx, f.doc)) != 0 {
		return f, ErrNeedsPassword
	}

	return f, nil
}
// NewFromMemory returns new fitz document from byte slice.
//
// The bytes are copied into C-allocated memory and wrapped in a MuPDF memory
// stream; the document type is chosen from the magic bytes via contentType.
// The returned Document must be released with Close.
//
// Errors:
//   - ErrCreateContext if the MuPDF context cannot be created,
//   - ErrOpenMemory if the memory stream cannot be created,
//   - ErrOpenDocument if MuPDF returns no document handle,
//   - ErrNeedsPassword if the document reports that it needs a password
//     (the document stays open in that case and should still be closed).
func NewFromMemory(b []byte) (f *Document, err error) {
	f = &Document{}

	f.ctx = (*C.struct_fz_context_s)(unsafe.Pointer(C.fz_new_context_imp(nil, nil, C.FZ_STORE_UNLIMITED, C.fz_version)))
	if f.ctx == nil {
		return f, ErrCreateContext
	}

	C.fz_register_document_handlers(f.ctx)

	// NOTE(review): on success this C copy (and our stream reference) is never
	// released, because Document has no field to remember it and Close does
	// not know about it — confirm whether that lifetime is intended.
	data := (*C.uchar)(C.CBytes(b))

	stream := C.fz_open_memory(f.ctx, data, C.size_t(len(b)))
	if stream == nil {
		// Nothing references the copy yet, so it can be released right away.
		C.free(unsafe.Pointer(data))
		C.fz_drop_context(f.ctx)
		f.ctx = nil
		return f, ErrOpenMemory
	}

	cmagic := C.CString(contentType(b))
	defer C.free(unsafe.Pointer(cmagic))

	f.doc = C.fz_open_document_with_stream(f.ctx, cmagic, stream)
	if f.doc == nil {
		// Stop here: querying a nil document for its password state is
		// invalid, and it could also overwrite ErrOpenDocument. Tear down
		// everything created so far, stream before the memory it reads from.
		C.fz_drop_stream(f.ctx, stream)
		C.free(unsafe.Pointer(data))
		C.fz_drop_context(f.ctx)
		f.ctx = nil
		return f, ErrOpenDocument
	}

	if int(C.fz_needs_password(f.ctx, f.doc)) != 0 {
		return f, ErrNeedsPassword
	}

	return f, nil
}
// NewFromReader reads r until EOF and opens the collected bytes as a fitz
// document via NewFromMemory. A read error is returned as-is together with a
// nil document.
func NewFromReader(r io.Reader) (f *Document, err error) {
	content, readErr := ioutil.ReadAll(r)
	if readErr != nil {
		return nil, readErr
	}

	return NewFromMemory(content)
}
// NumPage reports how many pages the document has, as counted by
// fz_count_pages.
func (f *Document) NumPage() int {
	count := C.fz_count_pages(f.ctx, f.doc)

	return int(count)
}
// Image renders the page with the given number at the package's default
// resolution of 500 DPI. It is shorthand for ImageDPI(pageNumber, 500.0).
func (f *Document) Image(pageNumber int) (image.Image, error) {
	const defaultDPI = 500.0

	return f.ImageDPI(pageNumber, defaultDPI)
}
// ImageDPI returns image for given page number and DPI.
//
// The page is rendered onto a white RGBA pixmap scaled by dpi/72 and the
// pixels are copied into a Go-owned *image.RGBA. The call holds the
// document mutex for its whole duration.
//
// Errors:
//   - ErrPageMissing if pageNumber is negative or not below NumPage,
//   - ErrCreatePixmap if the pixmap cannot be allocated,
//   - ErrPixmapSamples if the pixmap exposes no sample data.
func (f *Document) ImageDPI(pageNumber int, dpi float64) (image.Image, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	// Reject negative indexes as well; they are never valid pages.
	if pageNumber < 0 || pageNumber >= f.NumPage() {
		return nil, ErrPageMissing
	}

	page := C.fz_load_page(f.ctx, f.doc, C.int(pageNumber))
	defer C.fz_drop_page(f.ctx, page)

	var bounds C.fz_rect
	C.fz_bound_page(f.ctx, page, &bounds)

	var ctm C.fz_matrix
	C.fz_scale(&ctm, C.float(dpi/72), C.float(dpi/72))

	var bbox C.fz_irect
	C.fz_transform_rect(&bounds, &ctm)
	C.fz_round_rect(&bbox, &bounds)

	pixmap := C.fz_new_pixmap_with_bbox(f.ctx, C.fz_device_rgb(f.ctx), &bbox, nil, 1)
	if pixmap == nil {
		return nil, ErrCreatePixmap
	}
	defer C.fz_drop_pixmap(f.ctx, pixmap)

	// Start from an opaque white page.
	C.fz_clear_pixmap_with_value(f.ctx, pixmap, C.int(0xff))

	device := C.fz_new_draw_device(f.ctx, &ctm, pixmap)
	C.fz_enable_device_hints(f.ctx, device, C.FZ_NO_CACHE)
	defer C.fz_drop_device(f.ctx, device)

	drawMatrix := C.fz_identity
	C.fz_run_page(f.ctx, page, device, &drawMatrix, nil)
	C.fz_close_device(f.ctx, device)

	pixels := C.fz_pixmap_samples(f.ctx, pixmap)
	if pixels == nil {
		return nil, ErrPixmapSamples
	}

	// The pixmap covers bbox, so its size is the bbox extent, not its
	// bottom-right corner. Using x1*y1 is only right when the bbox starts at
	// the origin; otherwise it copies the wrong number of bytes and the
	// stride no longer matches the image rectangle.
	// Assumes the freshly allocated RGBA pixmap is tightly packed (4 bytes
	// per pixel, no row padding) — TODO confirm against fz_pixmap_stride.
	width := int(bbox.x1 - bbox.x0)
	height := int(bbox.y1 - bbox.y0)

	img := &image.RGBA{
		Pix:    C.GoBytes(unsafe.Pointer(pixels), C.int(4*width*height)),
		Stride: 4 * width,
		Rect:   image.Rect(int(bbox.x0), int(bbox.y0), int(bbox.x1), int(bbox.y1)),
	}

	return img, nil
}
// ImagePNG returns image for given page number as PNG bytes.
//
// The page is rendered onto a white RGBA pixmap scaled by dpi/72 and then
// encoded to PNG by MuPDF. The call holds the document mutex for its whole
// duration.
//
// Errors:
//   - ErrPageMissing if pageNumber is negative or not below NumPage,
//   - ErrCreatePixmap if the pixmap cannot be allocated.
func (f *Document) ImagePNG(pageNumber int, dpi float64) ([]byte, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	// Reject negative indexes as well; they are never valid pages.
	if pageNumber < 0 || pageNumber >= f.NumPage() {
		return nil, ErrPageMissing
	}

	page := C.fz_load_page(f.ctx, f.doc, C.int(pageNumber))
	defer C.fz_drop_page(f.ctx, page)

	var bounds C.fz_rect
	C.fz_bound_page(f.ctx, page, &bounds)

	var ctm C.fz_matrix
	C.fz_scale(&ctm, C.float(dpi/72), C.float(dpi/72))

	var bbox C.fz_irect
	C.fz_transform_rect(&bounds, &ctm)
	C.fz_round_rect(&bbox, &bounds)

	pixmap := C.fz_new_pixmap_with_bbox(f.ctx, C.fz_device_rgb(f.ctx), &bbox, nil, 1)
	if pixmap == nil {
		return nil, ErrCreatePixmap
	}
	defer C.fz_drop_pixmap(f.ctx, pixmap)

	// Start from an opaque white page.
	C.fz_clear_pixmap_with_value(f.ctx, pixmap, C.int(0xff))

	device := C.fz_new_draw_device(f.ctx, &ctm, pixmap)
	C.fz_enable_device_hints(f.ctx, device, C.FZ_NO_CACHE)
	defer C.fz_drop_device(f.ctx, device)

	drawMatrix := C.fz_identity
	C.fz_run_page(f.ctx, page, device, &drawMatrix, nil)
	C.fz_close_device(f.ctx, device)

	buf := C.fz_new_buffer_from_pixmap_as_png(f.ctx, pixmap, nil)
	defer C.fz_drop_buffer(f.ctx, buf)

	size := C.fz_buffer_storage(f.ctx, buf, nil)

	// Copy the encoded bytes straight into a Go slice; going through a Go
	// string first would copy the whole PNG twice.
	png := C.GoBytes(unsafe.Pointer(C.fz_string_from_buffer(f.ctx, buf)), C.int(size))

	return png, nil
}
// Text returns text for given page number.
//
// The page is run through a structured-text device with no option flags set
// and the result is returned as a single string. The call holds the document
// mutex for its whole duration.
//
// It returns ErrPageMissing if pageNumber is negative or not below NumPage.
func (f *Document) Text(pageNumber int) (string, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	// Reject negative indexes as well; they are never valid pages.
	if pageNumber < 0 || pageNumber >= f.NumPage() {
		return "", ErrPageMissing
	}

	page := C.fz_load_page(f.ctx, f.doc, C.int(pageNumber))
	defer C.fz_drop_page(f.ctx, page)

	var bounds C.fz_rect
	C.fz_bound_page(f.ctx, page, &bounds)

	// Scale factor 1: text extraction works in unscaled page coordinates.
	var ctm C.fz_matrix
	C.fz_scale(&ctm, C.float(72.0/72), C.float(72.0/72))

	text := C.fz_new_stext_page(f.ctx, &bounds)
	defer C.fz_drop_stext_page(f.ctx, text)

	var opts C.fz_stext_options
	opts.flags = 0

	device := C.fz_new_stext_device(f.ctx, text, &opts)
	C.fz_enable_device_hints(f.ctx, device, C.FZ_NO_CACHE)
	defer C.fz_drop_device(f.ctx, device)

	var cookie C.fz_cookie
	C.fz_run_page(f.ctx, page, device, &ctm, &cookie)
	C.fz_close_device(f.ctx, device)

	buf := C.fz_new_buffer_from_stext_page(f.ctx, text)
	defer C.fz_drop_buffer(f.ctx, buf)

	str := C.GoString(C.fz_string_from_buffer(f.ctx, buf))

	return str, nil
}
// HTML returns html for given page number.
//
// The page is run through a structured-text device with
// FZ_STEXT_PRESERVE_IMAGES set and printed as HTML. When header is true the
// output is additionally wrapped in MuPDF's HTML header and trailer. The call
// holds the document mutex for its whole duration.
//
// It returns ErrPageMissing if pageNumber is negative or not below NumPage.
func (f *Document) HTML(pageNumber int, header bool) (string, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	// Reject negative indexes as well; they are never valid pages.
	if pageNumber < 0 || pageNumber >= f.NumPage() {
		return "", ErrPageMissing
	}

	page := C.fz_load_page(f.ctx, f.doc, C.int(pageNumber))
	defer C.fz_drop_page(f.ctx, page)

	var bounds C.fz_rect
	C.fz_bound_page(f.ctx, page, &bounds)

	// Scale factor 1: extraction works in unscaled page coordinates.
	var ctm C.fz_matrix
	C.fz_scale(&ctm, C.float(72.0/72), C.float(72.0/72))

	text := C.fz_new_stext_page(f.ctx, &bounds)
	defer C.fz_drop_stext_page(f.ctx, text)

	var opts C.fz_stext_options
	opts.flags = C.FZ_STEXT_PRESERVE_IMAGES

	device := C.fz_new_stext_device(f.ctx, text, &opts)
	C.fz_enable_device_hints(f.ctx, device, C.FZ_NO_CACHE)
	defer C.fz_drop_device(f.ctx, device)

	var cookie C.fz_cookie
	C.fz_run_page(f.ctx, page, device, &ctm, &cookie)
	C.fz_close_device(f.ctx, device)

	buf := C.fz_new_buffer(f.ctx, 1024)
	defer C.fz_drop_buffer(f.ctx, buf)

	out := C.fz_new_output_with_buffer(f.ctx, buf)
	defer C.fz_drop_output(f.ctx, out)

	if header {
		C.fz_print_stext_header_as_html(f.ctx, out)
	}
	C.fz_print_stext_page_as_html(f.ctx, out, text)
	if header {
		C.fz_print_stext_trailer_as_html(f.ctx, out)
	}

	str := C.GoString(C.fz_string_from_buffer(f.ctx, buf))

	return str, nil
}
// SVG returns svg document for given page number.
//
// The page is run through MuPDF's SVG device, sized to the page bounds and
// created with FZ_SVG_TEXT_AS_PATH, and the generated markup is returned as a
// string. The call holds the document mutex for its whole duration.
//
// It returns ErrPageMissing if pageNumber is negative or not below NumPage.
func (f *Document) SVG(pageNumber int) (string, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	// Reject negative indexes as well; they are never valid pages.
	if pageNumber < 0 || pageNumber >= f.NumPage() {
		return "", ErrPageMissing
	}

	page := C.fz_load_page(f.ctx, f.doc, C.int(pageNumber))
	defer C.fz_drop_page(f.ctx, page)

	var bounds C.fz_rect
	C.fz_bound_page(f.ctx, page, &bounds)

	// Scale factor 1: the SVG keeps the page's own coordinate system.
	var ctm C.fz_matrix
	C.fz_scale(&ctm, C.float(72.0/72), C.float(72.0/72))
	C.fz_transform_rect(&bounds, &ctm)

	buf := C.fz_new_buffer(f.ctx, 1024)
	defer C.fz_drop_buffer(f.ctx, buf)

	out := C.fz_new_output_with_buffer(f.ctx, buf)
	defer C.fz_drop_output(f.ctx, out)

	device := C.fz_new_svg_device(f.ctx, out, bounds.x1-bounds.x0, bounds.y1-bounds.y0, C.FZ_SVG_TEXT_AS_PATH, 1)
	C.fz_enable_device_hints(f.ctx, device, C.FZ_NO_CACHE)
	defer C.fz_drop_device(f.ctx, device)

	var cookie C.fz_cookie
	C.fz_run_page(f.ctx, page, device, &ctm, &cookie)
	C.fz_close_device(f.ctx, device)

	str := C.GoString(C.fz_string_from_buffer(f.ctx, buf))

	return str, nil
}
// ToC returns the table of contents (also known as outline).
//
// The outline tree is flattened depth-first: every entry is followed by its
// children, and Level records the nesting depth starting at 1. The call
// holds the document mutex, like the rendering methods, because it uses the
// same MuPDF context.
//
// It returns ErrLoadOutline if fz_load_outline yields no outline.
func (f *Document) ToC() ([]Outline, error) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	outline := C.fz_load_outline(f.ctx, f.doc)
	if outline == nil {
		return nil, ErrLoadOutline
	}
	defer C.fz_drop_outline(f.ctx, outline)

	data := make([]Outline, 0)

	// walk appends node and its following siblings, descending into each
	// entry's children before moving on to the next sibling.
	var walk func(node *C.fz_outline, level int)
	walk = func(node *C.fz_outline, level int) {
		for ; node != nil; node = node.next {
			data = append(data, Outline{
				Level: level,
				Title: C.GoString(node.title),
				URI:   C.GoString(node.uri),
				Page:  int(node.page),
				Top:   float64(node.y),
			})

			if node.down != nil {
				walk(node.down, level+1)
			}
		}
	}

	walk(outline, 1)

	return data, nil
}
// Metadata returns the map with standard metadata.
//
// The map always contains the keys "format", "encryption", "title",
// "author", "subject", "keywords", "creator", "producer", "creationDate" and
// "modDate"; an entry is the empty string when MuPDF writes nothing for it.
// Values longer than the 256-byte lookup buffer are cut short. The call holds
// the document mutex, like the rendering methods, because it uses the same
// MuPDF context.
func (f *Document) Metadata() map[string]string {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	data := make(map[string]string, 10)

	lookup := func(key string) string {
		ckey := C.CString(key)
		defer C.free(unsafe.Pointer(ckey))

		buf := make([]byte, 256)
		C.fz_lookup_metadata(f.ctx, f.doc, ckey, (*C.char)(unsafe.Pointer(&buf[0])), C.int(len(buf)))

		// MuPDF writes a NUL-terminated C string into buf. Cut at the first
		// NUL so the value is not padded to 256 bytes with trailing zeros.
		n := 0
		for n < len(buf) && buf[n] != 0 {
			n++
		}

		return string(buf[:n])
	}

	data["format"] = lookup("format")
	data["encryption"] = lookup("encryption")
	data["title"] = lookup("info:Title")
	data["author"] = lookup("info:Author")
	data["subject"] = lookup("info:Subject")
	data["keywords"] = lookup("info:Keywords")
	data["creator"] = lookup("info:Creator")
	data["producer"] = lookup("info:Producer")
	data["creationDate"] = lookup("info:CreationDate")
	// The PDF Info dictionary key is "ModDate"; the lookup is case-sensitive,
	// so the lower-case spelling never matched.
	data["modDate"] = lookup("info:ModDate")

	return data
}
// Close closes the underlying fitz document.
//
// It drops the MuPDF document and then the context. Close takes the document
// mutex, so it waits for a method that is currently holding it, and it clears
// the handles afterwards so that calling Close again is a no-op instead of a
// double free. The Document must not be used after Close.
//
// The returned error is always nil.
func (f *Document) Close() error {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	if f.doc != nil {
		C.fz_drop_document(f.ctx, f.doc)
		f.doc = nil
	}

	if f.ctx != nil {
		C.fz_drop_context(f.ctx)
		f.ctx = nil
	}

	return nil
}
// contentType sniffs the leading bytes of b and returns the matching MIME
// type: "application/pdf" when b starts with "%PDF", "application/epub+zip"
// when b is a ZIP local file header ("PK\x03\x04") whose bytes 30..57 read
// "mimetypeapplication/epub+zip", and the empty string otherwise.
func contentType(b []byte) string {
	const (
		pdfMagic  = "%PDF"
		zipMagic  = "PK\x03\x04"
		epubEntry = "mimetypeapplication/epub+zip"
		epubStart = 30
		epubEnd   = epubStart + len(epubEntry)
	)

	switch {
	case len(b) >= len(pdfMagic) && string(b[:len(pdfMagic)]) == pdfMagic:
		return "application/pdf"
	case len(b) >= epubEnd &&
		string(b[:len(zipMagic)]) == zipMagic &&
		string(b[epubStart:epubEnd]) == epubEntry:
		return "application/epub+zip"
	}

	return ""
}