Skip to content

Commit

Permalink
Merge pull request #8 from jolicode/contentEncoding
Browse files Browse the repository at this point in the history
0.1.4 release with content encoding improvements
  • Loading branch information
damienalexandre committed Jun 17, 2014
2 parents ebb9cf2 + c87c681 commit 8539ddb
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 23 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
CHANGELOG
=========

### 0.1... (????-??-??) ###
### ??? ###

### 0.1.4 (2014-06-17) ###

* add HHVM tests on travis
* add libxml to composer requirements
* set APC 3.1.11 as conflict (https://bugs.php.net/bug.php?id=62190)
* do not process empty contents
* apply `mb_convert_encoding($content, 'HTML-ENTITIES', $encoding)` on all contents to fix encoding
* workaround for old (2.6.32) libxml versions (#7)
* better Org_Heigl_Hyphenator version requirement

### 0.1.3 (2013-11-15) ###

Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ It's designed to be:
Quick usage
===========

Just tell the Fixer class [which Fixer](#available-fixers) you want to run on your HTML content and then, call `fix()`:
Just tell the Fixer class [which Fixer](#available-fixers) you want to run on your **HTML contents** and then, call `fix()`:

```php
use JoliTypo\Fixer;
Expand All @@ -50,9 +50,13 @@ $fixed_content = $fixer->fix('<p>Je suis "très content" de t\'avoir invité sur
For your ease of use, you can find [ready to use list of Fixer for your language here](#fixer-recommendations-by-locale).
Micro-typography is nothing like a standard or a law; what really matters is consistency, so feel free to use your own lists.

Also, be advised that JoliTypo is intended to be used on HTML contents (not pages) and will remove potential `<head>`, `<html>` and `<body>` tags.

Installation
============

Requirements are handled by Composer (libxml and mbstring are required).

```
composer require jolicode/jolitypo 0.1.*
```
Expand Down
4 changes: 2 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
"require": {
"php": ">=5.3.0",
"ext-mbstring": "*",
"lib-libxml": ">2.6.32",
"org_heigl/hyphenator": ">=2.0.3"
"lib-libxml": "*",
"org_heigl/hyphenator": "~2.0.3"
},
"conflict": {
"ext-apc": "3.1.11"
Expand Down
66 changes: 49 additions & 17 deletions src/JoliTypo/Fixer.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,15 @@ public function __construct($rules)
*/
public function fix($content)
{
$trimmed = trim($content);
if (empty($trimmed)) {
return $content;
}

// Get a clean new StateBag
$this->state_bag = new StateBag();

$dom = $this->loadDOMDocument($content);
$dom = $this->loadDOMDocument($trimmed);

$this->processDOM($dom, $dom);

Expand Down Expand Up @@ -201,30 +206,57 @@ private function loadDOMDocument($content)
$dom->substituteEntities = false;
$dom->formatOutput = false;

// Change mb and libxml config
$libxml_current = libxml_use_internal_errors(true);
$mb_detect_current = mb_detect_order();
mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15");

// Little hack to force UTF-8
if (strpos($content, '<?xml encoding') === false) {
$hack = strpos($content, '<body') === false ? '<?xml encoding="UTF-8"><body>' : '<?xml encoding="UTF-8">';
$loaded = $dom->loadHTML($hack . $content);
} else {
$loaded = $dom->loadHTML($content);
}
$loaded = $dom->loadHTML($this->fixContentEncoding($content));

// Restore mb and libxml config
libxml_use_internal_errors($libxml_current);
mb_detect_order(implode(',', $mb_detect_current));

if (!$loaded) {
throw new InvalidMarkupException("Can't load the given HTML via DomDocument");
}

foreach ($dom->childNodes as $item) {
if ($item->nodeType === XML_PI_NODE) {
$dom->removeChild($item); // remove encoding hack
break;
}
return $dom;
}

/**
 * Prepare raw content so that DOMDocument::loadHTML() reads it with the right encoding.
 *
 * Steps, in order:
 *  1. prepend an `<?xml encoding="UTF-8">` hint (plus a `<body>` wrapper when the
 *     content has none) unless such a hint is already present;
 *  2. detect the content encoding using the current mb_detect_order();
 *  3. when a literal `<head>` tag exists, inject a Content-Type meta right after it;
 *  4. convert the whole string to HTML-ENTITIES so non-ASCII characters survive loading.
 *
 * Empty content is returned untouched.
 *
 * @see http://php.net/manual/en/domdocument.loadhtml.php#91513
 * @see https://github.com/jolicode/JoliTypo/issues/7
 *
 * @param string $content Raw HTML fragment or document
 * @return string Content ready to be passed to DOMDocument::loadHTML()
 */
private function fixContentEncoding($content)
{
    if (empty($content)) {
        return $content;
    }

    // Little hack to force UTF-8
    if (strpos($content, '<?xml encoding') === false) {
        $hack = strpos($content, '<body') === false ? '<?xml encoding="UTF-8"><body>' : '<?xml encoding="UTF-8">';
        $content = $hack . $content;
    }

    // mb_detect_encoding() returns false when no candidate matches; fall back to
    // UTF-8 (the encoding forced by the hack above) instead of leaking `false`
    // into the meta tag and into mb_convert_encoding().
    $encoding = mb_detect_encoding($content);
    if (false === $encoding) {
        $encoding = 'UTF-8';
    }

    // '<head>' is pure ASCII, so a byte-wise search/split is exact for every
    // ASCII-compatible encoding and does not depend on mbstring's internal
    // encoding matching the content (same approach as the strpos() calls above).
    $headpos = strpos($content, '<head>');

    // Add a meta to the <head> section
    if (false !== $headpos) {
        $headpos += 6; // strlen('<head>')
        $content = substr($content, 0, $headpos) .
            '<meta http-equiv="Content-Type" content="text/html; charset=' . $encoding . '">' .
            substr($content, $headpos);
    }

    // NOTE(review): the 'HTML-ENTITIES' mbstring target is deprecated as of PHP 8.2;
    // kept here because it is the documented behaviour of this release.
    return mb_convert_encoding($content, 'HTML-ENTITIES', $encoding);
}

/**
Expand All @@ -235,9 +267,9 @@ private function exportDOMDocument(\DOMDocument $dom)
{
// Remove added body & doctype
$content = preg_replace(array(
"/^\<\!DOCTYPE.*?<html><body>/si",
"!</body></html>$!si"),
"", $dom->saveHTML());
"/^\<\!DOCTYPE.*?<html>.*?<body>/si",
"!</body></html>$!si"
), "", $dom->saveHTML());

return trim($content);
}
Expand Down
25 changes: 25 additions & 0 deletions tests/JoliTypo/Tests/Html5Test.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,29 @@ public function testHtml5Markup()
// The test passes if there is no warning about this fix:
$this->assertEquals($html5, $fixer->fix($html5));
}

/**
 * Feeding a complete HTML page (doctype, head, body) to the fixer must yield
 * only the fixed body text, with English curly quotes applied.
 */
public function testFullPageMarkup()
{
    $page = <<<HTML
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
<title>Coucou</title>
</head>
<body>
"Who Let the Dogs Out?" is a song written and originally recorded by Anslem Douglas (titled "Doggie").
</body>
</html>
HTML;

    $expected = <<<STRING
&#8220;Who Let the Dogs Out?&#8221; is a song written and originally recorded by Anslem Douglas (titled &#8220;Doggie&#8221;).
STRING;

    $rules = array(new Fixer\EnglishQuotes());
    $fixer = new Fixer($rules);

    $this->assertInstanceOf('JoliTypo\Fixer', $fixer);
    $this->assertEquals($expected, $fixer->fix($page));
}
}
14 changes: 12 additions & 2 deletions tests/JoliTypo/Tests/JoliTypoTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,22 @@ public function testBadEncoding()

$this->assertEquals("Mentions L&eacute;gales", $fixer->fix(utf8_encode(utf8_decode("Mentions Légales"))));

// JoliTypo can't handle double encoded UTF-8 strings, nor ISO strings
// JoliTypo can handle double encoded UTF-8 strings, or ISO strings, but that's not a feature.
$isoString = mb_convert_encoding("Mentions Légales", "ISO-8859-1", "UTF-8");
$this->assertEquals("Mentions L&eacute;gales", $fixer->fix(utf8_encode($isoString)));
$this->assertNotEquals("Mentions L&eacute;gales", $fixer->fix($isoString));
$this->assertEquals("Mentions L&eacute;gales", $fixer->fix($isoString));
$this->assertEquals("Mentions L&Atilde;&copy;gales", $fixer->fix(utf8_encode(utf8_encode($isoString))));
}

/**
 * Empty or whitespace-only input is returned as-is; input with real content
 * comes back with its surrounding whitespace trimmed.
 */
public function testEmptyContent()
{
    $fixer = new Fixer(array('Trademark'));
    $this->assertInstanceOf('JoliTypo\Fixer', $fixer);

    // Each pair is (expected output, input given to fix()).
    $cases = array(
        array("", ""),
        array("\n ", "\n "),
        array("some content", "\n some content"),
    );

    foreach ($cases as $case) {
        list($expected, $input) = $case;
        $this->assertEquals($expected, $fixer->fix($input));
    }
}
}

class FakeFixer {}
Expand Down

0 comments on commit 8539ddb

Please sign in to comment.