You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

187 lines
3.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package main
import (
"bytes"
"fmt"
"image/jpeg"
"os"
"path/filepath"
"time"
"github.com/gin-gonic/gin"
// "github.com/karmdip-mi/go-fitz"
"test/go-fitz"
"github.com/ledongthuc/pdf"
"github.com/nfnt/resize"
)
// main starts a gin HTTP server with a single endpoint, GET /parsingPdf.
//
// The endpoint reads the "pdfFilePath" and "outPath" query parameters, hands
// them to parsingPDF and answers with a JSON object whose "imgPaths" field
// lists the generated image files.
func main() {
	r := gin.Default()
	r.GET("/parsingPdf", func(c *gin.Context) {
		pdfFilePath := c.Query("pdfFilePath")
		outPath := c.Query("outPath")

		// Reject missing parameters up front instead of letting them surface
		// as a panic (and an opaque 500) deep inside parsingPDF.
		if pdfFilePath == "" || outPath == "" {
			c.JSON(400, gin.H{
				"error": "pdfFilePath and outPath query parameters are required",
			})
			return
		}

		// NOTE(review): both values come straight from the request and are
		// used as filesystem paths without any restriction — confirm this
		// endpoint is only reachable by trusted callers.

		// The second result of parsingPDF is the document text; it is not
		// part of the response, so it is deliberately discarded here.
		imgPaths, _ := parsingPDF(pdfFilePath, outPath)
		c.JSON(200, gin.H{
			"imgPaths": imgPaths,
		})
	})

	// Run blocks while serving (gin's default address is :8080) and only
	// returns on failure, so report that failure instead of dropping it.
	if err := r.Run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
// parsingPDF renders every page of the PDF at pdfFilePath to a JPEG file
// inside outPath and returns the paths of the files it wrote.
//
// Each page is rasterised at 200 DPI, resized to a height of 2500 pixels and
// saved as "image-NNNNN.jpg", where NNNNN is the zero-based page index. The
// output directory is created if it does not exist.
//
// The second return value is reserved for the document text. Text extraction
// (see ReadPdf) is currently disabled, so it is always the empty string.
//
// The function has no error return: it panics if the document cannot be
// opened, a page cannot be rendered, or an output file cannot be written.
// Start and per-page Unix timestamps (seconds) are printed to standard output
// as simple progress/timing information.
func parsingPDF(pdfFilePath string, outPath string) ([]string, string) {
	startTime := time.Now().Unix()
	fmt.Println(startTime)

	doc, err := fitz.New(pdfFilePath)
	if err != nil {
		panic(err)
	}
	// Release the document when done, including when a later step panics.
	defer doc.Close()

	// The output directory does not change between pages; create it once.
	if err := os.MkdirAll(outPath, 0755); err != nil {
		panic(err)
	}

	var imgPaths []string
	// Extract pages as images.
	for n := 0; n < doc.NumPage(); n++ {
		// doc.Image(n) produces rather blurry output, so an explicit DPI is
		// used. Setting the DPI too high can crash the program.
		img, err := doc.ImageDPI(n, 200)
		if err != nil {
			panic(err)
		}

		imgPath := filepath.Join(outPath, fmt.Sprintf("image-%05d.jpg", n))
		imgPaths = append(imgPaths, imgPath)

		f, err := os.Create(imgPath)
		if err != nil {
			panic(err)
		}

		// Resize the image; a width of 0 leaves the width to the resize
		// library while the height is fixed at 2500.
		scaled := resize.Resize(0, 2500, img, resize.Lanczos3)

		// Always close the file before acting on the encode result so the
		// descriptor is not leaked when encoding fails, and surface a failed
		// close because it can mean the image was not fully written.
		encodeErr := jpeg.Encode(f, scaled, &jpeg.Options{Quality: jpeg.DefaultQuality})
		closeErr := f.Close()
		if encodeErr != nil {
			panic(encodeErr)
		}
		if closeErr != nil {
			panic(closeErr)
		}

		endTime := time.Now().Unix()
		fmt.Println(endTime)
		fmt.Println(endTime - startTime)
	}

	// Text extraction is disabled for now; the empty string is returned.
	var pdfText string
	return imgPaths, pdfText
}
// ReadPdf returns the plain-text content of the PDF file at path.
//
// It returns an empty string and a non-nil error if the file cannot be
// opened, the plain-text reader cannot be obtained, or reading from it fails.
func ReadPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("opening pdf %q: %w", path, err)
	}
	// Only schedule the close once the open is known to have succeeded.
	defer f.Close()

	b, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("extracting text from %q: %w", path, err)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(b); err != nil {
		return "", fmt.Errorf("reading text of %q: %w", path, err)
	}
	return buf.String(), nil
}
// ReadPdfGroup prints the text of the PDF file at path grouped by row.
//
// For every non-null page it writes each row's position to standard error
// and then each word of that row to standard output, one word per line.
// Nothing is accumulated: the returned string is always empty. A non-nil
// error is returned if the file cannot be opened or the rows of a page cannot
// be read.
func ReadPdfGroup(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("opening pdf %q: %w", path, err)
	}
	// Only schedule the close once the open is known to have succeeded.
	defer f.Close()

	totalPage := r.NumPage()
	// Pages are addressed from 1 up to and including totalPage.
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			return "", fmt.Errorf("reading rows of page %d in %q: %w", pageIndex, path, err)
		}

		for _, row := range rows {
			// Kept on standard error, matching the builtin println this
			// replaces; the words themselves go to standard output.
			fmt.Fprintln(os.Stderr, ">>>> row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
	return "", nil
}
// func readPdf(path string) (string, error) {
// f, r, err := pdf.Open(path)
// // remember close file
// defer f.Close()
// if err != nil {
// return "", err
// }
// var buf bytes.Buffer
// b, err := r.GetPlainText()
// if err != nil {
// return "", err
// }
// buf.ReadFrom(b)
// return buf.String(), nil
// }
// PDF格式的所有文本
// func readPdfFormatAll(path string) (string, error) {
// f, r, err := pdf.Open(path)
// // remember close file
// defer f.Close()
// if err != nil {
// return "", err
// }
// totalPage := r.NumPage()
// for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
// p := r.Page(pageIndex)
// if p.V.IsNull() {
// continue
// }
// var lastTextStyle pdf.Text
// texts := p.Content().Text
// for _, text := range texts {
// if isSameSentence(text, lastTextStyle) {
// lastTextStyle.S = lastTextStyle.S + text.S
// } else {
// fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
// lastTextStyle = text
// }
// }
// }
// return "", nil
// }