aboutsummaryrefslogtreecommitdiff
path: root/nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch
diff options
context:
space:
mode:
Diffstat (limited to 'nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch')
-rw-r--r-- nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch 675
1 files changed, 0 insertions, 675 deletions
diff --git a/nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch b/nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch
deleted file mode 100644
index adfee9ae282..00000000000
--- a/nixpkgs/pkgs/applications/misc/k2pdfopt/tesseract.patch
+++ /dev/null
@@ -1,675 +0,0 @@
-From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001
-From: Daniel Fullmer <danielrf12@gmail.com>
-Date: Fri, 13 Sep 2019 13:45:05 -0400
-Subject: [PATCH] Willus mod changes from k2pdfopt
-
----
- src/api/Makefile.am | 1 +
- src/api/baseapi.cpp | 87 +++++++++++
- src/api/baseapi.h | 3 +
- src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++
- src/api/tesseract.h | 29 ++++
- src/ccmain/tessedit.cpp | 5 +-
- src/ccutil/ccutil.h | 7 +
- src/ccutil/genericvector.h | 21 ++-
- src/ccutil/mainblk.cpp | 17 +-
- src/ccutil/params.cpp | 3 +-
- src/ccutil/serialis.cpp | 3 +
- src/ccutil/serialis.h | 2 +
- src/lstm/input.cpp | 3 +
- 13 files changed, 488 insertions(+), 4 deletions(-)
- create mode 100644 src/api/tesscapi.cpp
- create mode 100644 src/api/tesseract.h
-
-diff --git a/src/api/Makefile.am b/src/api/Makefile.am
-index d9b76eb6..cd2dc30f 100644
---- a/src/api/Makefile.am
-+++ b/src/api/Makefile.am
-@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
- libtesseract_api_la_SOURCES += pdfrenderer.cpp
- libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
- libtesseract_api_la_SOURCES += renderer.cpp
-+libtesseract_api_la_SOURCES += tesscapi.cpp
-
- lib_LTLIBRARIES += libtesseract.la
- libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
-diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
-index 9245d07c..ea964ee6 100644
---- a/src/api/baseapi.cpp
-+++ b/src/api/baseapi.cpp
-@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI()
- // Use the current locale if building debug code.
- std::locale::global(std::locale(""));
- #endif
-+ const char *locale;
-+ locale = std::setlocale(LC_ALL, nullptr);
-+/* willus mod Remove assertions--taken care of in tesscapi.cpp */
-+// ASSERT_HOST(!strcmp(locale, "C"));
-+ locale = std::setlocale(LC_CTYPE, nullptr);
-+// ASSERT_HOST(!strcmp(locale, "C"));
-+ locale = std::setlocale(LC_NUMERIC, nullptr);
-+// ASSERT_HOST(!strcmp(locale, "C"));
- }
-
- TessBaseAPI::~TessBaseAPI() {
-@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
- text->add_str_int("\t", bottom - top);
- }
-
-+/* willus mod */
-+int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0,
-+ char **utf8words)
-+
-+ {
-+ int iword,nwords,totlen,it8;
-+ int *x0,*y0,*x1,*y1,*ybaseline;
-+ char *tutf8;
-+
-+ ResultIterator *res_it = GetIterator();
-+ /* Count words */
-+ iword=0;
-+ totlen=0;
-+ while (!res_it->Empty(RIL_BLOCK))
-+ {
-+ if (res_it->Empty(RIL_WORD))
-+ {
-+ res_it->Next(RIL_WORD);
-+ continue;
-+ }
-+ iword++;
-+ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
-+ totlen+=strlen(textstr.string())+1;
-+ res_it->Next(RIL_WORD);
-+ }
-+ nwords=iword;
-+/*
-+printf("\nnwords=%d, totlen=%d\n",nwords,totlen);
-+*/
-+ x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords);
-+ y0=(*y00)=&x0[nwords];
-+ x1=(*x11)=&y0[nwords];
-+ y1=(*y11)=&x1[nwords];
-+ ybaseline=(*ybaseline0)=&y1[nwords];
-+ tutf8=(*utf8words)=(char *)malloc(totlen);
-+ iword=0;
-+ it8=0;
-+ res_it->Begin();
-+ while (!res_it->Empty(RIL_BLOCK))
-+ {
-+ if (res_it->Empty(RIL_WORD))
-+ {
-+ res_it->Next(RIL_WORD);
-+ continue;
-+ }
-+ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
-+ strcpy(&tutf8[it8],textstr.string());
-+ it8 += strlen(&tutf8[it8])+1;
-+ /*
-+ STRING textstr("");
-+ textstr += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
-+ */
-+/*
-+printf("Word %d: '%s'\n",iword,textstr.string());
-+*/
-+ int left, top, right, bottom;
-+ int u1,v1,u2,v2;
-+ res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
-+ res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2);
-+ x0[iword]=left;
-+ x1[iword]=right;
-+ y0[iword]=top;
-+ y1[iword]=bottom;
-+ ybaseline[iword]=(v1+v2)/2;
-+ iword++;
-+/*
-+printf("BB: (%d,%d)-(%d,%d) BL: (%d,%d)-(%d,%d)\n",left,bottom,right,top,x1,y1,x2,y2);
-+*/
-+ res_it->Next(RIL_WORD);
-+ }
-+/*
-+printf("iword=%d\n",iword);
-+*/
-+ return(iword);
-+ }
-+
-+/* willus mod */
-+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
-+
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
-diff --git a/src/api/baseapi.h b/src/api/baseapi.h
-index 3724dd92..23be5920 100644
---- a/src/api/baseapi.h
-+++ b/src/api/baseapi.h
-@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI {
- */
- char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
-
-+/* willus mod */
-+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
-+
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
-diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp
-new file mode 100644
-index 00000000..1752fafe
---- /dev/null
-+++ b/src/api/tesscapi.cpp
-@@ -0,0 +1,311 @@
-+/*
-+** tesscapi.cpp willus.com attempt at C wrapper for tesseract.
-+** (Butchered from tesseractmain.cpp)
-+** Last udpated 9-1-12
-+**
-+** Copyright (C) 2012 http://willus.com
-+**
-+** This program is free software: you can redistribute it and/or modify
-+** it under the terms of the GNU Affero General Public License as
-+** published by the Free Software Foundation, either version 3 of the
-+** License, or (at your option) any later version.
-+**
-+** This program is distributed in the hope that it will be useful,
-+** but WITHOUT ANY WARRANTY; without even the implied warranty of
-+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+** GNU Affero General Public License for more details.
-+**
-+** You should have received a copy of the GNU Affero General Public License
-+** along with this program. If not, see <http://www.gnu.org/licenses/>.
-+**
-+*/
-+
-+/*
-+#include "mfcpch.h"
-+*/
-+// #define USE_VLD //Uncomment for Visual Leak Detector.
-+#if (defined _MSC_VER && defined USE_VLD)
-+#include <vld.h>
-+#endif
-+
-+// Include automatically generated configuration file if running autoconf
-+#ifdef HAVE_CONFIG_H
-+#include "config_auto.h"
-+#endif
-+#include <locale.h>
-+#ifdef USING_GETTEXT
-+#include <libintl.h>
-+#define _(x) gettext(x)
-+#else
-+#define _(x) (x)
-+#endif
-+
-+#include "allheaders.h"
-+#include "baseapi.h"
-+#include "strngs.h"
-+#include "params.h"
-+#include "blobs.h"
-+#include "simddetect.h"
-+#include "tesseractclass.h"
-+/*
-+#include "notdll.h"
-+*/
-+
-+/* C Wrappers */
-+#include "tesseract.h"
-+
-+// static tesseract::TessBaseAPI api[4];
-+
-+/*
-+** ocr_type=0: OEM_DEFAULT
-+** ocr_type=1: OEM_TESSERACT_ONLY
-+** ocr_type=2: OEM_LSTM_ONLY
-+** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED
-+*/
-+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
-+ char *initstr,int maxlen,int *status)
-+
-+ {
-+ char original_locale[256];
-+ tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
-+/*
-+printf("@tess_capi_init\n");
-+printf(" datapath='%s'\n",datapath);
-+printf(" language='%s'\n",language);
-+printf(" ocr_type=%d\n",ocr_type);
-+*/
-+#ifdef USE_NLS
-+ setlocale (LC_ALL, "");
-+ bindtextdomain (PACKAGE, LOCALEDIR);
-+ textdomain (PACKAGE);
-+#endif
-+ /* willus mod, 11-24-16 */
-+ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
-+/*
-+printf("locale='%s'\n",setlocale(LC_ALL,NULL));
-+printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL));
-+printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
-+*/
-+ strncpy(original_locale,setlocale(LC_ALL,NULL),255);
-+ original_locale[255]='\0';
-+/*
-+printf("original_locale='%s'\n",original_locale);
-+*/
-+ setlocale(LC_ALL,"C");
-+/*
-+printf("new locale='%s'\n",setlocale(LC_ALL,NULL));
-+printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL));
-+printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
-+*/
-+ // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
-+ // Make the order of args a bit more forgiving than it used to be.
-+ const char* lang = "eng";
-+ tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK;
-+ if (language!=NULL && language[0]!='\0')
-+ lang = language;
-+ /*
-+ if (output == NULL)
-+ {
-+ fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
-+ "[-psm pagesegmode] [configfile...]\n"), argv[0]);
-+ fprintf(stderr,
-+ _("pagesegmode values are:\n"
-+ "0 = Orientation and script detection (OSD) only.\n"
-+ "1 = Automatic page segmentation with OSD.\n"
-+ "2 = Automatic page segmentation, but no OSD, or OCR\n"
-+ "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
-+ "4 = Assume a single column of text of variable sizes.\n"
-+ "5 = Assume a single uniform block of vertically aligned text.\n"
-+ "6 = Assume a single uniform block of text.\n"
-+ "7 = Treat the image as a single text line.\n"
-+ "8 = Treat the image as a single word.\n"
-+ "9 = Treat the image as a single word in a circle.\n"
-+ "10 = Treat the image as a single character.\n"));
-+ fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
-+ "configfile.\n"));
-+ exit(1);
-+ }
-+ */
-+/*
-+printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
-+printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
-+*/
-+/*
-+v4.00 loads either TESSERACT enginer, LSTM engine, or both. No CUBE.
-+*/
-+ ocr_type=0; /* Ignore specified and use default */
-+ api->SetOutputName(NULL);
-+ (*status)=api->Init(datapath,lang,
-+ ocr_type==0 ? tesseract::OEM_DEFAULT :
-+ (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY :
-+ (ocr_type==2 ? tesseract::OEM_LSTM_ONLY :
-+ (tesseract::OEM_TESSERACT_LSTM_COMBINED))));
-+ if ((*status)!=0)
-+ {
-+ /* willus mod, 11-24-16 */
-+ setlocale(LC_ALL,original_locale);
-+ api->End();
-+ delete api;
-+ return(NULL);
-+ }
-+ /*
-+ api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
-+ &(argv[arg]), argc - arg, NULL, NULL, false);
-+ */
-+ // We have 2 possible sources of pagesegmode: a config file and
-+ // the command line. For backwards compatability reasons, the
-+ // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
-+ // default for this program is tesseract::PSM_AUTO. We will let
-+ // the config file take priority, so the command-line default
-+ // can take priority over the tesseract default, so we use the
-+ // value from the command line only if the retrieved mode
-+ // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
-+ // in any config file. Therefore the only way to force
-+ // tesseract::PSM_SINGLE_BLOCK is from the command line.
-+ // It would be simpler if we could set the value before Init,
-+ // but that doesn't work.
-+ if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
-+ api->SetPageSegMode(pagesegmode);
-+
-+ /*
-+ ** Initialization message
-+ */
-+ {
-+ char istr[1024];
-+ int sse,avx;
-+
-+// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
-+ sprintf(istr,"%s",api->Version());
-+ sse=tesseract::SIMDDetect::IsSSEAvailable();
-+ avx=tesseract::SIMDDetect::IsAVXAvailable();
-+ if (sse || avx)
-+ sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX"));
-+ sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath);
-+ strcat(istr,"\n Tesseract languages: ");
-+ GenericVector<STRING> languages;
-+ api->GetLoadedLanguagesAsVector(&languages);
-+/*
-+printf("OEM=%d\n",api->oem());
-+printf("Langs='%s'\n",api->GetInitLanguagesAsString());
-+printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());
-+printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());
-+printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());
-+printf("languages.size()=%d\n",(int)languages.size());
-+*/
-+
-+ for (int i=0;i<=api->tesseract()->num_sub_langs();i++)
-+ {
-+ tesseract::Tesseract *lang1;
-+ int eng;
-+ lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1);
-+ eng=(int)lang1->tessedit_ocr_engine_mode;
-+ sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(),
-+ eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
-+ }
-+/*
-+printf("%d. '%s'\n",i+1,languages[i].string());
-+printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode);
-+*/
-+
-+ /*
-+ if (ocr_type==0 || ocr_type==3)
-+ sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
-+ else if (ocr_type==2)
-+ sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
-+ strncpy(&istr[strlen(istr)],language,253-strlen(istr));
-+ istr[253]='\0';
-+ strcat(istr,")");
-+ */
-+ if (out!=NULL)
-+ fprintf(out,"%s\n",istr);
-+ if (initstr!=NULL)
-+ {
-+ strncpy(initstr,istr,maxlen-1);
-+ initstr[maxlen-1]='\0';
-+ }
-+ }
-+
-+
-+ /* Turn off LSTM debugging output */
-+ api->SetVariable("lstm_debug_level","0");
-+#if (WILLUSDEBUG & 1)
-+ api->SetVariable("lstm_debug_level","9");
-+ api->SetVariable("paragraph_debug_level","9");
-+ api->SetVariable("tessdata_manager_debug_level","9");
-+ api->SetVariable("tosp_debug_level","9");
-+ api->SetVariable("wordrec_debug_level","9");
-+ api->SetVariable("segsearch_debug_level","9");
-+#endif
-+ /* willus mod, 11-24-16 */
-+ setlocale(LC_ALL,original_locale);
-+ return((void *)api);
-+ }
-+
-+
-+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
-+
-+ {
-+ tesseract::TessBaseAPI *api;
-+ static int old_segmode=-1;
-+
-+ api=(tesseract::TessBaseAPI *)vapi;
-+ if (old_segmode != segmode)
-+ {
-+ old_segmode=segmode;
-+ api->SetPageSegMode((tesseract::PageSegMode)segmode);
-+ }
-+ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
-+ {
-+ /* pixDestroy(&pix); */
-+ if (out!=NULL)
-+ fprintf(out,"tesscapi: Error during bitmap processing.\n");
-+ api->Clear();
-+ return(-1);
-+ }
-+ strncpy(outstr,api->GetUTF8Text(),maxlen-1);
-+ outstr[maxlen-1]='\0';
-+ api->Clear();
-+ return(0);
-+ }
-+
-+
-+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
-+ int **left,int **top,int **right,int **bottom,
-+ int **ybase,char **text,int *nw,
-+ FILE *out)
-+
-+ {
-+ tesseract::TessBaseAPI *api;
-+ static int old_segmode=-1;
-+
-+ api=(tesseract::TessBaseAPI *)vapi;
-+ if (old_segmode != segmode)
-+ {
-+ old_segmode=segmode;
-+ api->SetPageSegMode((tesseract::PageSegMode)segmode);
-+ }
-+ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
-+ {
-+ if (out!=NULL)
-+ fprintf(out,"tesscapi: Error during bitmap processing.\n");
-+ api->Clear();
-+ (*nw)=0;
-+ return(-1);
-+ }
-+ (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
-+ api->Clear();
-+ return(0);
-+ }
-+
-+
-+void tess_capi_end(void *vapi)
-+
-+ {
-+ tesseract::TessBaseAPI *api;
-+
-+ if (vapi==NULL)
-+ return;
-+ api=(tesseract::TessBaseAPI *)vapi;
-+ api->End();
-+ delete api;
-+ }
-diff --git a/src/api/tesseract.h b/src/api/tesseract.h
-new file mode 100644
-index 00000000..575948cc
---- /dev/null
-+++ b/src/api/tesseract.h
-@@ -0,0 +1,29 @@
-+/*
-+** Willus.com's Tesseract C Wrappers
-+**
-+** 6-8-12
-+**
-+*/
-+
-+#ifndef _TESSERACT_H_
-+#define _TESSERACT_H_
-+
-+//#include <leptonica.h>
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
-+ char *initstr,int maxlen,int *status);
-+int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out);
-+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
-+ int **left,int **top,int **right,int **bottom,
-+ int **ybase,char **text,int *nw,
-+ FILE *out);
-+void tess_capi_end(void *api);
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif
-diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp
-index 17f0951b..7af94ee2 100644
---- a/src/ccmain/tessedit.cpp
-+++ b/src/ccmain/tessedit.cpp
-@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data(
- " to your \"tessdata\" directory.\n");
- return false;
- }
-+ /* willus mod */
-+ TFile fp;
-+ strncpy(fp.tfile_filename,tessdata_path.string(),511);
-+ fp.tfile_filename[511]='\0';
- #ifndef DISABLED_LEGACY_ENGINE
- if (oem == OEM_DEFAULT) {
- // Set the engine mode from availability, which can then be overridden by
-@@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data(
- #endif // ndef DISABLED_LEGACY_ENGINE
-
- // If a language specific config file (lang.config) exists, load it in.
-- TFile fp;
- if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
- ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
- this->params());
-diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h
-index 71e89c60..bdeccc14 100644
---- a/src/ccutil/ccutil.h
-+++ b/src/ccutil/ccutil.h
-@@ -80,6 +80,13 @@ class CCUtil {
- // Member parameters.
- // These have to be declared and initialized after params_ member, since
- // params_ should be initialized before parameters are added to it.
-+/* willus mod */
-+/*
-+ #ifdef _WIN32
-+ STRING_VAR_H(tessedit_module_name, WINDLLNAME,
-+ "Module colocated with tessdata dir");
-+ #endif
-+*/
- INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities");
- BOOL_VAR_H(use_definite_ambigs_for_classifier, false,
- "Use definite ambiguities when running character classifier");
-diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
-index 3556d153..3a5e8662 100644
---- a/src/ccutil/genericvector.h
-+++ b/src/ccutil/genericvector.h
-@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) {
- // reserve an extra byte in case caller wants to append a '\0' character
- data->reserve(size + 1);
- data->resize_no_init(size);
-- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
-+ /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */
-+ /* Can't read entire file at once -- need to break up into smaller blocksize reads */
-+ {
-+ int frs,n;
-+ int blocksize;
-+ blocksize=1024*1024;
-+ for (n=0;1;)
-+ {
-+ int bs;
-+ bs= size-n > blocksize ? blocksize : size-n;
-+ frs=(int)fread(&(*data)[n],1,bs,fp);
-+ n+=frs;
-+ if (frs<bs || bs<blocksize || n>=size)
-+ break;
-+ }
-+ result = static_cast<long>((long)n==size);
-+ }
-+ /*
-+ result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
-+ */
- }
- fclose(fp);
- }
-diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp
-index 52b04b04..80b26044 100644
---- a/src/ccutil/mainblk.cpp
-+++ b/src/ccutil/mainblk.cpp
-@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
- #if defined(_WIN32)
- } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) {
- /* Look for tessdata in directory of executable. */
-+ /*
-+ char drive[_MAX_DRIVE];
-+ char dir[_MAX_DIR];
-+ */
- char path[_MAX_PATH];
-- DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
-+ int i;
-+ /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path));
-+ /* willus mod--avoid _splitpath_s -- not in XP */
-+ for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--);
-+ if (i>=0)
-+ {
-+ path[i]='\0';
-+ datadir=path;
-+ datadir += "/tessdata";
-+ }
-+ /*
- if (length > 0 && length < sizeof(path)) {
- char* separator = std::strrchr(path, '\\');
- if (separator != nullptr) {
-@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
- datadir += "/tessdata";
- }
- }
-+ */
- #endif /* _WIN32 */
- #if defined(TESSDATA_PREFIX)
- } else {
-diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp
-index 00bf2563..486c5ce0 100644
---- a/src/ccutil/params.cpp
-+++ b/src/ccutil/params.cpp
-@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
-
- if (!foundit) {
- anyerr = true; // had an error
-- tprintf("Warning: Parameter not found: %s\n", line);
-+ /* willus mod */
-+ tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename);
- }
- }
- }
-diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp
-index 7def011f..6107a494 100644
---- a/src/ccutil/serialis.cpp
-+++ b/src/ccutil/serialis.cpp
-@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) {
- offset_ = 0;
- is_writing_ = false;
- swap_ = false;
-+ /* willus mod */
-+ strncpy(tfile_filename,filename.string(),511);
-+ tfile_filename[511]='\0';
- if (reader == nullptr)
- return LoadDataFromFile(filename, data_);
- else
-diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
-index 095b9227..4cc8251e 100644
---- a/src/ccutil/serialis.h
-+++ b/src/ccutil/serialis.h
-@@ -77,6 +77,8 @@ class TFile {
- public:
- TFile();
- ~TFile();
-+ /* willus mod */
-+ char tfile_filename[512];
-
- // All the Open methods load the whole file into memory for reading.
- // Opens a file with a supplied reader, or nullptr to use the default.
-diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp
-index 73b584b3..0b0b54c3 100644
---- a/src/lstm/input.cpp
-+++ b/src/lstm/input.cpp
-@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
- return nullptr;
- }
- if (width < min_width || height < min_width) {
-+ /* willus mod -- no warning */
-+ /*
- tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
- height, min_width);
-+ */
- pixDestroy(&pix);
- return nullptr;
- }
---
-2.22.0
-