#!/bin/sh
#
#   pgm2txt - call gocr to convert pgm images into ASCII text
#             (This file is part of subtitle2pgm)
#
#   Copyright (C) 2002-2004 Arne Driescher
#                 2015 Joachim Wiedorn <joodevel@joonet.de>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 2 of the License, or
#   (at your option) any later version.
#
#   This package is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# -------------------------------------------------------------------------

# enable this for debugging
#set -x

# Release number shown by -V and in the usage text
VERSION="0.5.2"

# Leave the script with status 1 when interrupted by Control-C (SIGINT)
trap 'echo "Control-C pressed."; exit 1;' INT


###### Configuration section #################

# Directory of the GOCR database (handed to gocr with -p).
# Change it if you already have a database; if in doubt keep the default.
DB_PATH="./db/"

# Directory in which the language filter scripts
#   gocrfilter_en.sed  gocrfilter_fr.sed  gocrfilter_none.sed
# are installed.
PATH_TO_LANGUAGE_FILTER="/usr/local/share/subtitleripper"

# Option sets passed to gocr. See the gocr documentation
# for the meaning of the individual switches.

# ... used in pure database mode (selected with -d)
GOCR_OPTIONS_DBONLY="-d 0 -m 130 -m 4 -m 256 -m 32"

# ... used with automatic character recognition (the default)
GOCR_OPTIONS_AUTO="-m 130"

# Language of the default filter; the -f command line option
# overrides it. Valid values: none, en, fr, de, nl
LANGUAGE="none"

# Image viewer used to show the current pgm image while gocr is
# running, plus extra options for it. The viewer is only started
# when the -v command line option is given.
IMAGE_VIEWER="display"
IMAGE_VIEWER_OPTIONS=""

# "false" keeps the viewer off; -v switches this to "true"
DISPLAY_PGM="false"

###### End of configuration section #########


# usage - write the command line help to stdout and terminate.
#   Reads:  VERSION (shown in the "Version:" footer)
#   Exits:  always with status 1; it never returns to the caller
usage() {
    cat <<USAGE_TEXT
Usage:
    pgm2txt [-v] [-d] [-s num] [-f lang] base_name
    pgm2txt [-h]
    pgm2txt [-V]

    base_name       Are the common first letters of your subtitle
                    pgm files. E.g. "my_movie" if all your pgm
                    files are matched by "my_movie*.pgm".

    -d              Use GOCR options for "database only" mode.

    -f lang         You can optionally specify a language filter
                    using this option. Currently English, French
                    and German are supported.
                    lang = {de|en|fr|none}         Default: none

    -h              Print this usage help.

    -s num          Set spacewidth between words in units of dots
                    which will be used by GOCR. Default: 0 for
                    autodetection (details see: man gocr).

    -v              View the pgm-file while GOCR is converting.

    -V              Print version number.

Example:
    Convert PGM files with english language filter and view the
    PGM while GOCR is converting:

        pgm2txt -v -f en my_movie

Version:
    Part of subtitleripper ${VERSION} package.

USAGE_TEXT
    exit 1
}

# Called without any argument: show the help text (usage ends the script)
[ "$#" -ne 0 ] || usage

# Automatic character recognition is the default mode;
# the -d option replaces these options later on
GOCR_OPTIONS="${GOCR_OPTIONS_AUTO}"

# Process command line options.
#   -d       switch GOCR_OPTIONS to the database-only option set
#   -f lang  set LANGUAGE (codes of 1 to 3 characters only)
#   -s num   set GOCR_SPACE to "-s num" (whole numbers 1..39 only)
#   -v       enable the image viewer
#   -V       print the version and exit successfully
#   -h       print the usage help and exit successfully
# Any other option prints the usage help and exits with status 1.
# Invalid -f / -s values are ignored with a warning on stderr.

# Start from an empty value so that a GOCR_SPACE inherited from the
# environment cannot leak into the gocr command line.
GOCR_SPACE=""

while getopts "df:hs:vV" OPTION
do
  case "${OPTION}" in
    d)
        GOCR_OPTIONS=${GOCR_OPTIONS_DBONLY}
        ;;
    f)
        # language filter; ${#OPTARG} avoids the non-portable "echo -n"
        if [ "${#OPTARG}" -gt 0 ] && [ "${#OPTARG}" -lt 4 ]; then
            LANGUAGE=${OPTARG}
        else
            echo "pgm2txt: ignoring invalid language '${OPTARG}'" >&2
        fi
        ;;
    s)
        # minimum wordspace; anything that is not a plain number is
        # mapped to 0 so that the numeric test below never sees text
        case "${OPTARG}" in
            ''|*[!0-9]*) SPACE_WIDTH=0 ;;
            *)           SPACE_WIDTH=${OPTARG} ;;
        esac
        if [ "${SPACE_WIDTH}" -gt 0 ] && [ "${SPACE_WIDTH}" -lt 40 ]; then
            GOCR_SPACE="-s ${OPTARG}"
        else
            echo "pgm2txt: ignoring invalid spacewidth '${OPTARG}'" >&2
        fi
        ;;
    v)
        DISPLAY_PGM=true
        ;;
    V)
        # print version number; this is a successful request
        printf '%s\n' "${VERSION}"
        exit 0
        ;;
    h)
        # usage always exits with status 1, so run it in a subshell and
        # report success for an explicit help request
        (usage)
        exit 0
        ;;
    *)
        # unknown option or missing option argument
        usage
        ;;
  esac
done

# Drop the options parsed by getopts; the first argument
# that is left is the pgm file base name
shift $((OPTIND - 1))
PGM_BASE_NAME=$1

# Combine all gocr options
GOCR_ALL_OPTIONS="${GOCR_SPACE} ${GOCR_OPTIONS}"

# Look up the sed filter script for the selected language. The path is
# quoted so that a filter directory containing whitespace still works.
# Without a filter file the script falls back to unfiltered output.
FILTER_SCRIPT="${PATH_TO_LANGUAGE_FILTER}/gocrfilter_${LANGUAGE}.sed"
if [ -f "${FILTER_SCRIPT}" ]; then
    echo "Using ${FILTER_SCRIPT} to filter gocr output"
else
    echo "    ------------------------------------------------"
    echo "    No filter file for language >${LANGUAGE}< found!"
    echo "     Please set PATH_TO_LANGUAGE_FILTER in pgm2txt  "
    echo "    and make sure you have choosen a valid language!"
    echo "               No spell checking activated!         "
    echo "    ------------------------------------------------"
    LANGUAGE="none"
    FILTER_SCRIPT=""
fi


# Check if gocr is in the search path.
# "command -v" is the POSIX lookup; unlike which(1) it needs no external
# program. It prints nothing when gocr cannot be found.
GOCR_TEST_PATH=$(command -v gocr 2>/dev/null)
if [ -z "${GOCR_TEST_PATH}" ]; then
    echo "  ------------------------------------------------ "
    echo "                 Cannot find gocr !                 "
    echo "    Please make sure you have installed gocr and   "
    echo "            add it to your search path.            "
    echo "  ------------------------------------------------ "
    exit 1
fi

# Create the local database directory if it doesn't exist yet.
# Paths are quoted so that a DB_PATH with whitespace works, and -p
# creates missing parent directories of a nested DB_PATH as well.
if [ ! -d "${DB_PATH}" ]; then
    echo "creating directory ${DB_PATH}"
    mkdir -p -- "${DB_PATH}"
fi

# Create an empty database list file in that directory if there is none
if [ ! -f "${DB_PATH}/db.lst" ]; then
    echo "creating empty file ${DB_PATH}/db.lst"
    touch -- "${DB_PATH}/db.lst"
fi

# Run gocr on all pgm files (plain and gzip-compressed) whose names start
# with PGM_BASE_NAME. The text goes to "<file>.txt", filtered through the
# language sed script unless LANGUAGE is "none".
# The base name is quoted so that names with whitespace work; a pattern
# that matches nothing stays literal and is reported as "not found".
for pgm_file in "${PGM_BASE_NAME}"*.pgm "${PGM_BASE_NAME}"*.pgm.gz; do
    if [ ! -f "${pgm_file}" ]; then
        echo "File ${pgm_file} not found"
        continue
    fi

    echo "Converting ${pgm_file} into text"

    # Optionally show the image while gocr is working. The PID is kept so
    # that exactly this viewer can be closed again afterwards.
    viewer_pid=""
    if [ "${DISPLAY_PGM}" != "false" ]; then
        # IMAGE_VIEWER_OPTIONS is deliberately unquoted: it holds a list of options
        # shellcheck disable=SC2086
        ${IMAGE_VIEWER} ${IMAGE_VIEWER_OPTIONS} "${pgm_file}" &
        viewer_pid=$!
    fi

    # GOCR_ALL_OPTIONS is deliberately unquoted: it holds a list of options
    if [ "none" = "${LANGUAGE}" ]; then
        # shellcheck disable=SC2086
        gocr ${GOCR_ALL_OPTIONS} -p "${DB_PATH}" "${pgm_file}" > "${pgm_file}.txt"
    else
        # shellcheck disable=SC2086
        gocr ${GOCR_ALL_OPTIONS} -p "${DB_PATH}" "${pgm_file}" \
            | sed -f "${FILTER_SCRIPT}" - > "${pgm_file}.txt"
    fi

    # Close the viewer started above. Only that process is signalled;
    # other running instances of the same program are left alone.
    if [ -n "${viewer_pid}" ]; then
        kill "${viewer_pid}" 2>/dev/null
    fi

done
