How can I save in 2 different formats?
Hello,
How can I save the TEXT and the HOCR output at the same time?
At the moment I run cuneiform twice to do the job.
cuneiform -l ger -f hocr -o $FILE.html $FILE.tiff
cuneiform -l ger -f text -o $FILE.text $FILE.tiff
Question information
- Language:
- English Edit question
- Status:
- Solved
- Assignee:
- No assignee Edit question
- Last query:
- Last reply:
Related FAQ:
None Link to a FAQ
This question was reopened
Revision history for this message
![]() |
#1 |
No way until you write a patch... :-(
Revision history for this message
![]() |
#2 |
Thanks Yury V. Zaytsev, that solved my question.
Revision history for this message
![]() |
#3 |
I have added a patch that additionally saves the result as plain text.
So the user can save in the -f format and additionally in plain text, for debugging or anything else.
Do you want to add this to the codebase?
Changes:
-------
Bool32 saveplaintext = FALSE;
-------
} else if(strcmp(argv[i], "--saveplaintext") == 0) {
-------
cout << "Usage: " << argv[0] << "[-l languagename -f format --dotmatrix --fax --saveplaintext -o result_file] imagefile\n";
-------
if(
cerr << "PUMA_XSave failed.\n";
return 1;
}
if (saveplaintext) {
outfilename += ".txt";
cerr << "PUMA_XSave failed.\n";
return 1;
}
}
-------
-------
cuneiform-cli.cpp 0.7.0
-------
/*
Copyright (c) 2008, 2009 Jussi Pakkanen
Разрешается повторное распространение и использование как в виде исходного кода,
так и в двоичной форме, с изменениями или без, при соблюдении следующих условий:
* При повторном распространении исходного кода должны оставаться указанное
выше уведомление об авторском праве, этот список условий и последующий
отказ от гарантий.
* При повторном распространении двоичного кода в документации и/или в
других материалах, поставляемых при распространении, должны сохраняться
указанная выше информация об авторском праве, этот список условий и
последующий отказ от гарантий.
* Ни название Cognitive Technologies, ни имена ее сотрудников не могут
быть использованы в качестве средства поддержки и/или продвижения
продуктов, основанных на этом ПО, без предварительного письменного
разрешения.
ЭТА ПРОГРАММА ПРЕДОСТАВЛЕНА ВЛАДЕЛЬЦАМИ АВТОРСКИХ ПРАВ И/ИЛИ ДРУГИМИ ЛИЦАМИ "КАК
ОНА ЕСТЬ" БЕЗ КАКОГО-ЛИБО ВИДА ГАРАНТИЙ, ВЫРАЖЕННЫХ ЯВНО ИЛИ ПОДРАЗУМЕВАЕМЫХ,
ВКЛЮЧАЯ ГАРАНТИИ КОММЕРЧЕСКОЙ ЦЕННОСТИ И ПРИГОДНОСТИ ДЛЯ КОНКРЕТНОЙ ЦЕЛИ, НО НЕ
ОГРАНИЧИВАЯСЬ ИМИ. НИ ВЛАДЕЛЕЦ АВТОРСКИХ ПРАВ И НИ ОДНО ДРУГОЕ ЛИЦО, КОТОРОЕ
МОЖЕТ ИЗМЕНЯТЬ И/ИЛИ ПОВТОРНО РАСПРОСТРАНЯТЬ ПРОГРАММУ, НИ В КОЕМ СЛУЧАЕ НЕ
НЕСЁТ ОТВЕТСТВЕННОСТИ, ВКЛЮЧАЯ ЛЮБЫЕ ОБЩИЕ, СЛУЧАЙНЫЕ, СПЕЦИАЛЬНЫЕ ИЛИ
ПОСЛЕДОВАВШИЕ УБЫТКИ, СВЯЗАННЫЕ С ИСПОЛЬЗОВАНИЕМ ИЛИ ПОНЕСЕННЫЕ ВСЛЕДСТВИЕ
НЕВОЗМОЖНОСТИ ИСПОЛЬЗОВАНИЯ ПРОГРАММЫ (ВКЛЮЧАЯ ПОТЕРИ ДАННЫХ, ИЛИ ДАННЫЕ,
СТАВШИЕ НЕГОДНЫМИ, ИЛИ УБЫТКИ И/ИЛИ ПОТЕРИ ДОХОДОВ, ПОНЕСЕННЫЕ ИЗ-ЗА ДЕЙСТВИЙ
ТРЕТЬИХ ЛИЦ И/ИЛИ ОТКАЗА ПРОГРАММЫ РАБОТАТЬ СОВМЕСТНО С ДРУГИМИ ПРОГРАММАМИ,
НО НЕ ОГРАНИЧИВАЯСЬ ЭТИМИ СЛУЧАЯМИ), НО НЕ ОГРАНИЧИВАЯСЬ ИМИ, ДАЖЕ ЕСЛИ ТАКОЙ
ВЛАДЕЛЕЦ ИЛИ ДРУГОЕ ЛИЦО БЫЛИ ИЗВЕЩЕНЫ О ВОЗМОЖНОСТИ ТАКИХ УБЫТКОВ И ПОТЕРЬ.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the Cognitive Technologies nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* This is a simple command line program for the Puma library. */
#include"
#include<iostream>
#include<sstream>
#include<iomanip>
#include<stdio.h>
#include<stdlib.h>
#include<cstring>
#include"cttypes.h"
#include"puma.h"
#include"config.h"
using namespace std;
/* One row of the language table: a numeric language id paired with its name. */
struct langlist {
int puma_number; // Numeric language id; a negative value marks the terminating row.
const char *name; // Language name; NULL in the terminating row.
};
/* Table of recognition languages.
 * Names are language codes according to ISO 639-2.
 * The table ends with a row whose puma_number is negative; code that walks
 * the table must stop there and must not dereference that row's NULL name.
 * NOTE(review): only disabled rows and the terminating row are present in
 * this copy of the file -- confirm the active rows against the upstream source.
 */
static const langlist langs[] = {
// {PUMA_LANG_DIG, "dig"}, // This probably means "recognize digits only".
// {PUMA_LANG_UZBEK, "uzb"}, // These don't seem to have data files. Thus they are disabled.
// {PUMA_LANG_KAZ, "kaz"},
// {PUMA_LANG_KAZ_ENG, "kazeng"},
{-1, NULL} // Terminating row: must stay last.
};
/* One row of the output format table. */
struct formatlist {
int puma_number; // Numeric format id; a negative value marks the terminating row.
const char * name; // Short format name; NULL in the terminating row.
const char * descr; // Human-readable description of the format.
};
/* Table of output formats: PUMA format id, short name, description.
 * The table ends with a row whose puma_number is negative.
 * NOTE(review): two rows below are cut off after the PUMA_ prefix in this
 * copy, so the table does not compile as shown; restore them from the
 * upstream source.
 */
static const formatlist formats[] = {
// Does not work. {PUMA_TOTABLEDBF, "dbf", "DBF format"},
{PUMA_TOHTML, "html", "HTML format"},
{PUMA_TOHOCR, "hocr", "hOCR HTML format"},
// NOTE(review): the next row is truncated.
{PUMA_
{PUMA_TORTF, "rtf", "RTF format"},
// NOTE(review): the next row is truncated.
{PUMA_
{PUMA_TOTEXT, "text", "plain text"},
// Table code is missing. {PUMA_TOTABLETXT, "tabletxt", ""},
{-1, NULL}
};
static string supported_
ostringstream os;
os << "Supported languages:";
for(const langlist *l = langs; l->puma_number >= 0; l++)
os << " " << l->name;
os << ".\n";
return os.str();
}
/**
 * Build the multi-line text describing the output formats.
 * Starts with a heading line, then visits every row of the formats table
 * up to the terminating row, whose puma_number is negative.
 * NOTE(review): the statement that prints each row is cut off after
 * setiosflags( in this copy, so the per-row layout cannot be confirmed here.
 */
static string supported_formats() {
ostringstream os;
os << "Supported formats:\n";
// Stop at the terminating row (negative puma_number).
for(const formatlist * f = formats; f->puma_number >= 0; f++)
os << " " << setiosflags(
return os.str();
}
/**
* Read file and return it as a BMP DIB entity. On failure write an error
* and return NULL. Caller delete[]'s the returned result.
*/
static char* read_file(const char *fname);
#ifdef USE_MAGICK
#include <Magick++.h>
using namespace Magick;
static char* read_file(const char *fname) {
Blob blob;
size_t data_size;
char *dib;
try {
Image image(fname);
// Write to BLOB in BMP format
} catch(Exception &error_) {
cerr << error_.what() << "\n";
return NULL;
}
data_size = blob.length();
dib = new char[data_size];
memcpy(dib, blob.data(), data_size);
return dib;
}
#else // No ImageMagick++
/**
 * Plain-stdio implementation of read_file(), used when ImageMagick support
 * is not compiled in. Accepts only uncompressed BMP files whose info header
 * is the 40-byte "Windows V3" variant.
 *
 * The 14-byte BMP file header (2-byte magic, 32-bit file size, two reserved
 * 16-bit fields, 32-bit pixel data offset) is consumed and discarded; the
 * returned buffer starts at the info header. Multi-byte fields are read in
 * host byte order, exactly as stored in the file.
 *
 * @param fname path of the BMP file to load.
 * @return buffer allocated with new[] that the caller must delete[], or NULL
 *         after printing a message to stderr when the file cannot be opened,
 *         is truncated, or is not a supported BMP.
 */
static char* read_file(const char *fname) {
    const int32_t v3_header_size = 40;  // size field of a "Windows V3" info header
    const int compression_offset = 16;  // byte offset of the compression field in it
    char magic[2];
    char skipped[8];                    // two reserved 16-bit fields + 32-bit data offset
    int32_t filesize = 0;

    FILE *f = fopen(fname, "rb");
    if (!f) {
        std::cerr << "Could not open file " << fname << ".\n";
        return NULL;
    }

    if (fread(magic, 1, sizeof magic, f) != sizeof magic ||
            magic[0] != 'B' || magic[1] != 'M') {
        std::cerr << fname << " is not a BMP file.\n";
        fclose(f);  // do not leak the handle on the error path
        return NULL;
    }
    if (fread(&filesize, sizeof filesize, 1, f) != 1 ||
            fread(skipped, 1, sizeof skipped, f) != sizeof skipped) {
        std::cerr << fname << " is truncated: incomplete BMP file header.\n";
        fclose(f);
        return NULL;
    }

    // Everything after the file header is the DIB. Reject size fields that
    // leave no room for an info header; a zero or tiny value would otherwise
    // turn into a negative array length below.
    const long header_end = ftell(f);
    const long dibsize = (header_end < 0) ? -1L : static_cast<long>(filesize) - header_end;
    if (dibsize < v3_header_size) {
        std::cerr << fname << " is not a valid BMP file: bad size field.\n";
        fclose(f);
        return NULL;
    }

    char *dib = new char[dibsize];
    const size_t got = fread(dib, 1, static_cast<size_t>(dibsize), f);
    fclose(f);
    if (got != static_cast<size_t>(dibsize)) {
        std::cerr << fname << " is truncated: image data is shorter than the size field.\n";
        delete[] dib;
        return NULL;
    }

    // Copy the fields out with memcpy instead of casting the char buffer,
    // which avoids alignment and aliasing problems.
    int32_t field = 0;
    memcpy(&field, dib, sizeof field);
    if (field != v3_header_size) {
        std::cerr << "BMP is not of type \"Windows V3\", which is the only supported format.\n";
        std::cerr << "Please convert your BMP to uncompressed V3 format and try again.\n";
        delete[] dib;
        return NULL;
    }
    memcpy(&field, dib + compression_offset, sizeof field);
    if (field != 0) {
        std::cerr << fname << " is a compressed BMP. Only uncompressed BMP files are supported.\n";
        std::cerr << "Please convert your BMP to uncompressed V3 format and try again.\n";
        delete[] dib;
        return NULL;
    }
    return dib;
}
#endif // USE_MAGICK
/**
 * Program entry point.
 *
 * Prints a version banner, walks the command-line arguments (format,
 * output file, --dotmatrix, --fax, --saveplaintext; any argument that is
 * not a recognised switch is taken as the input image name), loads the
 * image through read_file() and reports failures of the PUMA init, open,
 * recognition, save, close and done steps on stderr.
 *
 * Returns 0 on success and also when only the usage text is printed
 * because no input file was given; returns 1 after any reported failure.
 *
 * NOTE(review): many statements in this copy are cut off (bare if( lines,
 * bare PUMA_ lines, empty branch bodies), so the function does not compile
 * as shown. Restore the missing text from the upstream cuneiform-cli.cpp
 * before building; the comments here describe only what is visible.
 */
int main(int argc, char **argv) {
char *dib;
const char *infilename = NULL;
int langcode = PUMA_LANG_ENGLISH; // By default recognize plain english text.
Bool32 dotmatrix = FALSE;
Bool32 fax = FALSE;
// Flag added by the patch discussed above; requests an extra plain-text copy of the result.
Bool32 saveplaintext = FALSE;
// Stem used to build the output file name when none is given.
const char *defaultnamestem = "cuneiform-out.";
string outfilename;
Int32 outputformat = PUMA_TOTEXT;
cout << "Cuneiform for Linux " << CF_VERSION << "\n";
/* Parsing command line parameters. */
for(int i=1; i<argc; i++) {
/* Changing language. */
// NOTE(review): the test for the language switch itself is missing before this line.
if(++i >= argc) {
}
for(int j=0; langs[j]
}
}
}
} else if(strcmp(argv[i], "-f") == 0) {
if(++i >= argc) {
}
for(int j=0; formats[
}
}
}
} else if(strcmp(argv[i], "-o") == 0) {
if(++i >= argc) {
}
} else if(strcmp(argv[i], "--dotmatrix") == 0) {
} else if(strcmp(argv[i], "--fax") == 0) {
fax = TRUE;
// NOTE(review): the body of the next branch is missing in this copy; nothing visible sets saveplaintext.
} else if(strcmp(argv[i], "--saveplaintext") == 0) {
} else {
/* No switches, so set input file. */
infilename = argv[i];
}
}
// Fall back to the default name stem; the per-format case bodies are missing in this copy.
if (outfilename.
outfilename = defaultnamestem;
switch (outputformat) {
case PUMA_TOHOCR:
case PUMA_TOHTML:
case PUMA_TORTF:
case PUMA_TOTEXT:
case PUMA_TOSMARTTEXT:
case PUMA_TOTABLETXT:
case PUMA_TOEDNATIVE:
case PUMA_TOTABLEDBF:
}
}
// Without an input file there is nothing to do: print the usage line and exit with status 0.
if(infilename == NULL) {
cout << "Usage: " << argv[0] << "[-l languagename -f format --dotmatrix --fax --saveplaintext -o result_file] imagefile\n";
return 0;
}
dib = read_file(
if(!dib) // Error msg is already printed so just get out.
return 1;
if(
cerr << "PUMA_Init failed.\n";
return 1;
}
// Set the language.
PUMA_
PUMA_
PUMA_
if(
cerr << "PUMA_Xopen failed.\n";
return 1;
}
/* From recogpuma.cpp
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
LPUMA_
rc = LPUMA_XFinalRec
*/
/* This seems to work (at least not crash). But since it is untested and
* I am not sure what is the proper function calling convention, it
* is disabled.
*/
/*
Bool32 singlecolumn = FALSE;
PUMA_
if(
return 1;
}
*/
if(
cerr << "PUMA_XFinalrec
return 1;
}
if(
cerr << "PUMA_XSave failed.\n";
return 1;
}
if (saveplaintext) {
outfilename += ".txt";
cerr << "PUMA_XSave failed.\n";
return 1;
}
}
if(
cerr << "PUMA_XClose failed.\n";
return 1;
}
if(
cerr << "PUMA_Done failed.\n";
return 1;
}
delete []dib;
return 0;
}
Revision history for this message
![]() |
#4 |
This question was expired because it remained in the 'Open' state without activity for the last 15 days.
Revision history for this message
![]() |
#5 |
I have created a bug report for you. The answer tracker shouldn't be used for this kind of request.