-
Notifications
You must be signed in to change notification settings - Fork 27
/
makepdf
executable file
·130 lines (118 loc) · 6.23 KB
/
makepdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# makepdf: build a pdf out of a series of images.
VERSION="1.0"
# Resolve ${0} with which and keep its directory; the shared function library
# is expected to sit in that same directory.
SCRIPTDIR="$(dirname "$(which "${0}")")"
# Nothing below can work without the shared functions, so stop right away when
# the library cannot be sourced.
if ! . "${SCRIPTDIR}/mmfunctions" ; then
    echo "Missing '${SCRIPTDIR}/mmfunctions'. Exiting."
    exit 1
fi
# External programs this script relies on (listed in the help text).
DEPENDENCIES=(ffmpeg pdfjoin tesseract)
# Not defined in this file; presumably supplied by mmfunctions - confirm there.
_initialize_make
# Print the help text for this script on stdout and terminate the script.
# Globals read: VERSION, DEPENDENCIES.
# Does not return; calls exit without an explicit status.
# NOTE(review): the -e line mentions a -d option, while the getopts string used
# by this script only accepts o, e, E and h - confirm the intended wording.
_usage(){
    local script_name
    script_name="$(basename "${0}")"
    printf '\n'
    printf '%s\n' "${script_name} ${VERSION}"
    printf '%s\n' "This application will create a small pdf file (suitable for sharing online) from a collection of image file or package input with the following options."
    # One " name" per dependency gives the same space-separated list as before.
    printf 'Dependencies:'
    printf ' %s' "${DEPENDENCIES[@]}"
    printf '\n'
    printf '%s\n' \
        "Usage: ${script_name} [ -o /path/to/deliver/to/ ] fileorpackage1 [ fileorpackage2 ...]" \
        " -o directory ( directory to write the resulting file to )" \
        " -e emailaddress ( send an email about the delivery, only valid if -d is used )" \
        " -E emailaddress ( send an email about process outcome )" \
        " -h ( display this help )"
    printf '\n'
    exit
}
# With no arguments at all there is nothing to do: show the help text.
if [ "${#}" -eq 0 ] ; then
    _usage
fi
# Command-line options: forced output directory and notification addresses.
OPTIND=1
while getopts ":o:e:E:h" OPT ; do
    case "${OPT}" in
        o)
            OUTPUTDIR_FORCED="${OPTARG}"
            _check_outputdir_forced
            ;;
        e)
            EMAILADDRESS_DELIVERY="${OPTARG}"
            ;;
        E)
            EMAILADDRESS_OUTCOME="${OPTARG}"
            ;;
        h)
            _usage
            ;;
        :)
            # The leading ':' in the option string routes a missing argument here.
            echo "Option -${OPTARG} requires an argument"
            _writeerrorlog "makepdf" "The option selected required an argument and none was provided. The script had to exit."
            exit 1
            ;;
        *)
            echo "bad option -${OPTARG}"
            _usage
            ;;
    esac
done
# Drop the parsed options so only the file/package arguments remain.
shift "$(( OPTIND - 1 ))"
# NOTE(review): check_emailaddress has no leading underscore, unlike the other
# helpers called in this script - confirm the name against mmfunctions.
if [ -n "${EMAILADDRESS_OUTCOME}" ] ; then
    check_emailaddress "${EMAILADDRESS_OUTCOME}"
fi
if [ -n "${EMAILADDRESS_DELIVERY}" ] ; then
    check_emailaddress "${EMAILADDRESS_DELIVERY}"
fi
# Per-package driver. For every file/package argument: convert each TIFF in
# <package>/objects to JPEG, OCR every page with tesseract into a one-page pdf
# and a txt file, join the pages into ${OUTPUT} with pdfjoin and concatenate
# the page texts into <package>/objects/access/txt_1/<mediaid>.txt.
# Arguments: the file/package paths left after option parsing.
# Globals read:    OUTPUTDIR_FORCED, VERSION, MIDDLE_OPTIONS (initial value),
#                  MAKEYOUTUBE_DELIVERY_EMAIL_TO
# Globals written: INPUT, OUTPUTDIR, LOGDIR, OUTPUTDIRTEXT, SOURCEDIR, MEDIAID,
#                  OUTPUT, TMP_MAKEPDF_DIR, TMP_JPG_DIR, MIDDLE_OPTIONS,
#                  TESSERACT_CONFIG, SCRIPT_TITLE, SCRIPT_AUTHOR (kept global,
#                  as they were when this code ran at top level)
# Exits 1 as soon as an input has no objects directory.
# Paths are handled line by line, so spaces are fine; newlines in file names
# are not supported.
_makepdf_main(){
    local base_middle_options tif_list pdf_pages found
    # Remember the options present before the first package so that the
    # title/author options of one package are not carried into the next one.
    base_middle_options=("${MIDDLE_OPTIONS[@]}")
    # The tesseract settings are the same for every page; define them once.
    TESSERACT_CONFIG=(-c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^\&*(){}[]\|\"':;?/>.<,~\` " -c textord_min_linesize=2.25 -c preserve_interword_spaces=1)
    while [ "${*}" != "" ] ; do
        # get context about the input
        INPUT="${1}"
        shift
        MIDDLE_OPTIONS=("${base_middle_options[@]}")
        if [ -z "${OUTPUTDIR_FORCED}" ] ; then
            if [ -d "${INPUT}" ] ; then
                OUTPUTDIR="${INPUT}/objects/access/pdf_1"
                LOGDIR="${INPUT}/metadata/logs"
            fi
            if [ -f "${INPUT}" ] ; then
                OUTPUTDIR="$(dirname "${INPUT}")/access/pdf_1"
                LOGDIR="$(dirname "${INPUT}")/access/logs"
            fi
            # Fallback when neither test above has produced an output directory.
            if [ ! "${OUTPUTDIR}" ] ; then
                OUTPUTDIR="${INPUT}/objects/access/pdf_1"
                LOGDIR="${INPUT}/metadata/logs"
            fi
        else
            OUTPUTDIR="${OUTPUTDIR_FORCED}"
            LOGDIR="${OUTPUTDIR}/logs"
        fi
        OUTPUTDIRTEXT="${INPUT}/objects/access/txt_1"
        _run mkdir -p "${LOGDIR}"
        # From here on everything printed is also copied into a log file.
        # NOTE(review): the redirection is replaced for every package without
        # restoring the previous target, so the tee processes stack across
        # packages - confirm this is acceptable.
        exec > >(tee "${LOGDIR}/$(basename "${0}")_$(_get_iso8601_c)_$(basename "${0}")_${VERSION}.txt")
        exec 2>&1
        #_find_input "${INPUT}"
        if [ -d "${INPUT}/objects" ] ; then
            SOURCEDIR="${INPUT}/objects"
        else
            _report -wt "No source objects folder"
            _writeerrorlog "makepdf" "No source objects folder provided. The script could not continue."
            exit 1
        fi
        MEDIAID=$(basename "${INPUT}" | cut -d. -f1)
        # set up output
        _log -b
        OUTPUT="${OUTPUTDIR}/${MEDIAID%.*}.pdf"
        if [ -s "${OUTPUT}" ] ; then
            _report -wt "WARNING ${OUTPUT} already exists, skipping process"
            # INPUT was already shifted off above; shifting again here would
            # silently drop the next package.
            continue
        fi
        _run mkdir -p "${OUTPUTDIR}"
        TMP_MAKEPDF_DIR="${INPUT}/tmp"
        TMP_JPG_DIR="${TMP_MAKEPDF_DIR}/jpgs"
        _run mkdir -p "${TMP_MAKEPDF_DIR}" "${TMP_JPG_DIR}" "${OUTPUTDIRTEXT}"
        # Collect the page images first (one path per line, no word splitting),
        # then work on the list; renaming below cannot disturb the listing.
        tif_list=()
        while IFS= read -r found ; do
            tif_list+=("${found}")
        done < <(find "${SOURCEDIR}" -maxdepth 1 -mindepth 1 -iname "*.tif" -type f | sort)
        for TIF in "${tif_list[@]}" ; do
            tifname="$(basename "${TIF}")"
            _report -dt "Working on ${tifname}..."
            # Page number: the text between the first "_" and the following ".".
            pageno="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f1)"
            # check that tif name adheres to filename pattern and rename if not
            if [[ "${#pageno}" != "3" ]] ; then
                tifdir="$(dirname "${TIF}")"
                # NOTE(review): the second cut keeps field 2 of a "."-split, so a
                # prefix that itself contains a dot is reduced to the part after
                # its first dot - confirm this is intended.
                basename="$(echo "${tifname}" | cut -d_ -f1 | cut -d. -f2)"
                EXTENSION="${tifname##*.}"
                # Left-pad with zeros and keep the last three characters.
                pageno="$(echo "0000${pageno}" | tail -c 4 | head -c 3)"
                _report -dt "Renaming to fit image sequence naming pattern."
                NEW_TIF_NAME="${tifdir}/${basename}_${pageno}.${EXTENSION}"
                mv -v -n "${TIF}" "${NEW_TIF_NAME}"
                TIF="${NEW_TIF_NAME}"
            fi
            TIF_BASE_NAME="$(basename "${TIF%.*}")"
            JPG_NAME="${TMP_JPG_DIR}/${TIF_BASE_NAME}.jpg"
            # Reuse a JPEG left over from an earlier run instead of re-encoding.
            if [[ ! -s "${JPG_NAME}" ]] ; then
                ffmpeg -hide_banner -nostdin -i "${TIF}" -pix_fmt yuvj420p "${JPG_NAME}"
            fi
            tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" pdf
            tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" txt
        done
        _report -dt "Checking for PBCore data"
        SCRIPT_TITLE=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Series']" -o ": " -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Episode']")
        if [[ -n "${SCRIPT_TITLE}" ]] ; then
            MIDDLE_OPTIONS+=(--pdftitle "${SCRIPT_TITLE}")
        fi
        SCRIPT_AUTHOR=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -m "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreCreator" -v "p:creatorRole" -o ": " -v "p:creator" -o " ; ")
        if [[ -n "${SCRIPT_AUTHOR}" ]] ; then
            MIDDLE_OPTIONS+=(--pdfauthor "${SCRIPT_AUTHOR}")
        fi
        # Hand the page pdfs to pdfjoin as separate, intact arguments in sorted order.
        pdf_pages=()
        while IFS= read -r found ; do
            pdf_pages+=("${found}")
        done < <(find "${TMP_JPG_DIR}" -name "*.pdf" | sort)
        pdfjoin "${MIDDLE_OPTIONS[@]}" --pdfkeywords "${MEDIAID}" --fitpaper 'false' --rotateoversize 'false' --paper letter "${pdf_pages[@]}" --outfile "${OUTPUT}"
        # Append every page's text, each under a small header, to one transcript.
        find "${TMP_JPG_DIR}" -name "*.txt" | sort | while IFS= read -r page ; do
            PAGENAME="$(basename "${page%.*}")"
            {
                echo "======================================"
                echo "Page: ${PAGENAME}"
                echo "======================================"
                echo
                cat "${page}"
            } >> "${OUTPUTDIRTEXT}/${MEDIAID}.txt"
        done
        # The cleanup below is disabled, so ${TMP_MAKEPDF_DIR} is left in place.
        #if [ -d "${TMP_MAKEPDF_DIR}" ] ; then
        # _run rm -rvf "${TMP_MAKEPDF_DIR}"
        #fi
        _summarize_make
        # NOTE(review): the variable name refers to makeyoutube - confirm it is
        # the intended delivery address setting for makepdf.
        _deliver_output "${MAKEYOUTUBE_DELIVERY_EMAIL_TO}"
        _log -e
    done
}
_makepdf_main "$@"