Mac OS X Technologies

Splitting a PDF using Python, naming the output based on a string within the PDF

I use this script daily to split multi-page PDFs into separate pages, naming them with the value of a text string within the PDF.

Legendary member Hiroto wrote this for me and I tweak it from time to time to adapt it to new requirements; however, I am stuck.

The current script finds the text string beginning with BX_ in this line:

m = re.search(r'BX_\\S*', page.string())

and uses the variable "name" as the new filename.

I am trying to write the new PDF with the variable "m" appended to the original filename.

So the file: testfile.pdf containing the string BX_123

would write an output file named: testfile_BX123.pdf

I don't know how to grab the input filename.

-- # start testing
-- Interactive harness for running main() outside Esko Automation Engine:
-- prompts for the input PDFs and a destination folder, converts both to
-- POSIX paths (the form main() expects) and invokes the handler.

set inputs to choose file with prompt ("Choose PDF Files.") of type {"com.adobe.pdf"} with multiple selections allowed
set outputFolder to choose folder with prompt ("Choose Destination Folder.")

-- Replace each chosen alias in place with its POSIX path string.
repeat with i in inputs
    set i's contents to i's POSIX path
end repeat
set outputFolder to outputFolder's POSIX path

-- No optional parameters are used by this script.
set params to {}

main(inputs, outputFolder, params)
-- # end testing

(*

    for Esko Automation Engine Script Runner

*)

on main(inputs, outputFolder, params)
    (*
        Split every input PDF into single-page PDFs. Each page is saved in
        outputFolder as <match>.pdf, where <match> is the first text run on
        that page beginning with BX_ (regex BX_\S*). Pages without such a
        string are skipped and reported.

        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings (not used here)
        return string : "OK" | "Warning" | "Error"
            "OK"      - the Python helper exited with status 0
            "Warning" - exit status 1 (at least one page had no BX_ string)
            "Error"   - any other status (a PDF could not be opened or a
                        page could not be written)

        * to be invoked by Esko Automation Engine
        cf.
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

        NOTE(review): relies on the system Python 2 with PyObjC at
        /usr/bin/python; confirm it exists on the target macOS version.
    *)
    script o
        -- Argument vector for the helper: output folder first, then inputs.
        property aa : {outputFolder} & inputs

        -- Shell-quote every argument so paths with spaces survive.
        set args to ""
        repeat with a in my aa
            set args to args & a's quoted form & space
        end repeat

        try
            -- The plain hyphen after the heredoc tells python to read the
            -- program from stdin; the quoted paths follow as sys.argv[1:].
            -- Backslashes are doubled because this is an AppleScript string.
            do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
import sys, re
from Foundation import NSURL
from Quartz.PDFKit import PDFDocument

argv = [ a.decode('utf-8') for a in sys.argv[1:] ]
outdir = argv.pop(0).rstrip('/')

# exit status: 0 = ok, 1 = some page had no match, 2 = open/save failure
ret = 0

for f in [ a for a in argv if re.search(r'\\.pdf$', a, re.I) ]:
    url = NSURL.fileURLWithPath_(f)
    doc = PDFDocument.alloc().initWithURL_(url)
    if doc is None:
        # initWithURL_ returns None for an unreadable or invalid PDF
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    pcnt = doc.pageCount()
    for i in range(0, pcnt):
        page = doc.pageAtIndex_(i)
        # page.string() may be None for a page without text
        m = re.search(r'BX_\\S*', page.string() or u'')
        if not m:
            ret = max(1, ret)
            print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
            continue    # ignore this page
        name = m.group()
        doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation()) # doc for this page
        # The output name is the matched string only, so pages (from any
        # input) that share the same match are written to the same path.
        if not doc1.writeToFile_('%s/%s.pdf' % (outdir,name)):
            ret = max(2, ret)
            print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))
sys.exit(ret)
EOF"
            set {r, err} to {result, 0}
        on error errs number errn
            -- A non-zero exit status arrives here; errs carries the output.
            set {r, err} to {errs, errn}
        end try

        if err = 0 then
            return "OK"
        else if err = 1 then
            log r
            return "Warning"
        else
            log r
            return "Error"
        end if
    end script
    tell o to run
end main

-- END OF APPLESCRIPT

Click to rate this post!
[Total: 0 Average: 0]

Related posts

Applescript to launch X11 GV or ghostview application

Bobby

script to remove characters in filename

Undershmidtther

OS X Prior to 10.11 Will Not Backup to Time Machine?

Coolio

Leave a Comment