English | 中文
PHP Tesseract OCR is a PHP extension written in C++ for character recognition and OCR learning in a PHP environment.
This article only introduces the installation of PHP-CPP and PHP TesseractOCR. For Tesseract installation and other configuration, please refer to this article: http://www.5lazy.cn/post-141.html
- Linux or OS X (Windows is not supported for the time being)
- PHP7 or later
- Tesseract 4.0.0 or later
- PHP-CPP 2.1.2 or later
- GCC 4.8 or later
⚠ Before installation, add the php-config of the current PHP environment to your PATH environment variable.
# Fetch the PHP-CPP sources, then build and install the library.
git clone https://github.com/CopernicaMarketingSoftware/PHP-CPP.git
cd PHP-CPP
make
# The install step is run through sudo.
sudo make install
If multiple warnings appear when compiling on OS X, you can ignore them.
⚠ Before doing this, make sure that Tesseract version 4.0.0 or above is installed.
# Fetch the php-tesseract-ocr sources, then build and install the extension.
git clone https://github.com/2654709623/php-tesseract-ocr.git
cd php-tesseract-ocr
make
# The install step is run through sudo.
sudo make install
If multiple warnings appear when compiling on OS X, you can ignore them.
Because the functions of PHPTesseractOCR are not standard PHP functions, an IDE cannot auto-complete them. To make development easier, you can click here to download the IDE helper, which enables auto-completion in the IDE.
use tesseract_ocr\Tesseract;

// Quick start: initialise the engine with the English traineddata,
// hand it an image and print the text it returns.
$dataDir = __DIR__ . '/traineddata/tessdata-fast/'; // directory path ends with "/"
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$text = $tesseract
    ->init($dataDir, 'eng')
    ->setImage($imagePath)
    ->getUTF8Text();

echo $text;
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$tesseract->init($dataDir, 'eng')->setImage($imagePath);

// Callback handed to getComponentImages(): it is given a position (x, y),
// a size (w, h) and the text, and prints them followed by an HTML line break.
$printComponent = function ($x, $y, $w, $h, $text) {
    echo "Result:{$text}X:{$x}Y:{$y}Width:{$w}Height:{$h}", '<br>';
};

// 'RIL_WORD' selects word-level components.
$tesseract->getComponentImages('RIL_WORD', $printComponent);
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$tesseract
    ->init($dataDir, 'eng')
    ->setImage($imagePath)
    ->recognize(0);

// Callback handed to getIterator(): it is given the text and two corner
// coordinates (x1, y1, x2, y2) and prints them followed by an HTML line break.
$printLine = function ($text, $x1, $y1, $x2, $y2) {
    echo "Text:{$text}X1:{$x1}Y1:{$y1}X2:{$x2}Y2:{$y2}", '<br>';
};

// 'RIL_TEXTLINE' selects line-level results.
$tesseract->getIterator('RIL_TEXTLINE', $printLine);

// Finally print the full text as well.
echo $tesseract->getUTF8Text();
Help to improve recognition speed
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();

// Restrict recognition to a sub-rectangle of the image before reading the text.
// NOTE(review): the four arguments are presumably left, top, width, height,
// as in Tesseract's SetRectangle — confirm against the extension.
$text = $tesseract
    ->init($dataDir, 'eng')
    ->setImage($imagePath)
    ->setRectangle(100, 100, 100, 100)
    ->getUTF8Text();

echo $text;
use tesseract_ocr\Tesseract;

$tesseract = new Tesseract();

// Initialise with the English traineddata, choose automatic page segmentation,
// load the image, run recognition and then the layout analysis.
$tesseract->init(__DIR__ . '/traineddata/tessdata-fast/', 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage(__DIR__ . '/img/1.png')
    ->recognize(0)
    ->analyseLayout(); // the terminating ";" was missing here, which made the snippet a parse error

// Print the recognized text.
echo $tesseract->getUTF8Text();
Setting additional parameters
use tesseract_ocr\Tesseract;

$tesseract = new Tesseract();

// Parameter name => value pairs, applied one by one in the order listed.
$variables = [
    // Example 1
    'save_blob_choices' => 'T',
    // Example 2
    'tessedit_char_whitelist' => '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    // Example 3
    'tessedit_char_blacklist' => 'xyz',
];

foreach ($variables as $name => $value) {
    $tesseract->setVariable($name, $value);
}
setVariable Options Reference:http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
Tesseract initialization
Traineddata download:https://github.com/tesseract-ocr/tessdata
use tesseract_ocr\Tesseract;

// The traineddata directory path must end with "/".
$fastDataDir = __DIR__ . '/traineddata/tessdata-fast/';
$rawDataDir = __DIR__ . '/traineddata/tessdata-raw/';

$tesseract = new Tesseract();

// Single language
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($fastDataDir, 'eng');

// Multiple languages, joined with "+"
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($fastDataDir, 'eng+chi_sim');

// Setting the engine mode through the third argument
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($rawDataDir, 'eng', 'OEM_TESSERACT_LSTM_COMBINED');
Engine Mode Options:
- OEM_DEFAULT(Default, based on what is available.)
- OEM_LSTM_ONLY(Neural nets LSTM engine only.)
- OEM_TESSERACT_LSTM_COMBINED(Legacy + LSTM engines.)
- OEM_TESSERACT_ONLY(Legacy engine only.)
Setting Page Segmentation Mode
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';

$tesseract = new Tesseract();

// Set a variable, initialise the engine, then choose the page segmentation mode.
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($dataDir, 'eng')
    ->setPageSegMode('PSM_AUTO');
PageSegMode Options Reference:https://rmtheis.github.io/tess-two/javadoc/com/googlecode/tesseract/android/TessBaseAPI.PageSegMode.html
Setting the image to recognize
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
// Supported image formats: png, jpg, jpeg, tif, webp
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($dataDir, 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage($imagePath);
Setting image recognition area
use tesseract_ocr\Tesseract;

$tesseract = new Tesseract();

// Configure the engine, load the image, then limit recognition to a
// sub-rectangle of it. (The original snippet had a doubled "->" before
// setVariable, which is a parse error.)
// NOTE(review): the four arguments are presumably left, top, width, height,
// as in Tesseract's SetRectangle — confirm against the extension.
$tesseract->setVariable('save_blob_choices', 'T')
    ->init(__DIR__ . '/traineddata/tessdata-fast/', 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage(__DIR__ . '/img/1.png')
    ->setRectangle(100, 100, 100, 100);
After Recognize, the output is kept internally until the next SetImage
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($dataDir, 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage($imagePath)
    ->setRectangle(100, 100, 100, 100)
    // For the time being, recognize() only supports 0 or null as its argument.
    ->recognize(0);
Applying page layout analysis
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();

// Full pipeline: configure, load the image, restrict the area,
// recognize, and finally run the layout analysis.
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($dataDir, 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage($imagePath)
    ->setRectangle(100, 100, 100, 100)
    ->recognize(0)
    ->analyseLayout();
Get page layout analysis
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();

// Run the layout analysis and then query the orientation information.
// NOTE(review): the four variables are not assigned beforehand, so they are
// presumably output parameters filled in by orientation() — confirm.
$tesseract
    ->setVariable('save_blob_choices', 'T')
    ->init($dataDir, 'eng')
    ->setPageSegMode('PSM_AUTO')
    ->setImage($imagePath)
    ->setRectangle(100, 100, 100, 100)
    ->recognize(0)
    ->analyseLayout()
    ->orientation($orientation, $writingDirection, $textlineOrder, $deskewAngle);
Search for text blocks
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();
$tesseract->init($dataDir, 'eng')->setImage($imagePath);

// The first argument is a PageIteratorLevel name; the callback is given
// a position (x, y), a size (w, h) and the text, and prints them.
$tesseract->getComponentImages(
    'RIL_WORD',
    function ($x, $y, $w, $h, $text) {
        echo "Result:{$text}X:{$x}Y:{$y}Width:{$w}Height:{$h}", '<br>';
    }
);
PageIteratorLevel Options:
- RIL_BLOCK(Block of text/image/separator line.)
- RIL_PARA(Paragraph within a block.)
- RIL_TEXTLINE(Line within a paragraph.)
- RIL_WORD(Word within a textline.)
- RIL_SYMBOL(Symbol/character within a word.)
Get result iterator
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

$tesseract = new Tesseract();

// recognize() is called before the results are iterated.
$tesseract
    ->init($dataDir, 'eng')
    ->setImage($imagePath)
    ->recognize(0);

// The callback is given the text and two corner coordinates
// (x1, y1, x2, y2) and prints them followed by an HTML line break.
$tesseract->getIterator(
    'RIL_TEXTLINE',
    function ($text, $x1, $y1, $x2, $y2) {
        echo "Text:{$text}X1:{$x1}Y1:{$y1}X2:{$x2}Y2:{$y2}", '<br>';
    }
);
See getComponentImages for parameters
Get UTF8 characters
use tesseract_ocr\Tesseract;

$dataDir = __DIR__ . '/traineddata/tessdata-fast/';
$imagePath = __DIR__ . '/img/1.png';

// Initialise, load the image and fetch the text in one fluent chain.
$tesseract = new Tesseract();
$text = $tesseract
    ->init($dataDir, 'eng')
    ->setImage($imagePath)
    ->getUTF8Text();

echo $text;
Free up recognition results and any stored image data
use tesseract_ocr\Tesseract;

// The original snippet was missing the ";" after init() and after both
// setImage() calls, so it did not parse; the statements are terminated here.
$tesseract = new Tesseract();
$tesseract->init(__DIR__ . '/traineddata/tessdata-fast/', 'eng');

// Three images were recognized normally.
for ($i = 1; $i <= 3; $i++) {
    $tesseract->setImage(__DIR__ . '/img/' . $i . '.png');
    echo $tesseract->getUTF8Text();
}

// Only one can be identified.
for ($i = 1; $i <= 3; $i++) {
    $tesseract->setImage(__DIR__ . '/img/' . $i . '.png');
    echo $tesseract->getUTF8Text();
    // Free the recognition results and any stored image data.
    $tesseract->clear();
}
Get php tesseract version
use tesseract_ocr\Tesseract;

// Print the version of the PHP Tesseract extension.
$tesseract = new Tesseract();
$extensionVersion = $tesseract->version();

echo $extensionVersion;
Get tesseract version
use tesseract_ocr\Tesseract;

// Print the version of the underlying Tesseract library.
$tesseract = new Tesseract();
$libraryVersion = $tesseract->tesseract();

echo $libraryVersion;
Apache License Version 2.0 see http://www.apache.org/licenses/LICENSE-2.0.html