forked from AKSW/csvimport.ontowiki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CsvParser.php
206 lines (177 loc) · 6.63 KB
/
CsvParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
<?php
require_once 'lib/IteratorReader.php';
/**
 * Loads a CSV file into an array and provides N-Triples style string escaping.
 *
 * @category component
 * @package csvimport
 * @author Michael Martin [email protected]
 */
class CsvParser
{
    /**
     * Replacement emitted for every byte sequence that is not acceptable
     * UTF-8: the six literal characters \uFFFD (the escaped form of
     * U+FFFD REPLACEMENT CHARACTER).
     */
    const error_character = '\\uFFFD';

    /**
     * The parsed CSV file as returned by File_CSV_IteratorReader::toArray().
     *
     * @var array
     * @access private
     * @author Michael Martin [email protected]
     */
    private $csvMap = array();

    /**
     * Parses the given CSV file and keeps the result for getParsedFile().
     *
     * Side effect: raises max_execution_time and memory_limit and enables
     * auto_detect_line_endings for the current request.
     *
     * @access public
     * @param string $fileName   Path of the CSV file to read.
     * @param string $separator  Field separator handed to the CSV reader.
     * @param bool   $useHeaders Passed through to the reader's toArray();
     *                           presumably keys rows by the header line --
     *                           TODO confirm against File_CSV_IteratorReader.
     * @author Michael Martin [email protected]
     *
     * @throws Exception if the parsed result is empty.
     */
    public function __construct($fileName = "", $separator = ',', $useHeaders = false)
    {
        // Lift limits that would otherwise interrupt the import.
        ini_set("max_execution_time", "600");
        ini_set("memory_limit", "1536M");
        ini_set("auto_detect_line_endings", true);

        // Parse the file and check that something was read.
        $this->csvMap = $this->readCSV($fileName, $separator, $useHeaders);
        if (empty($this->csvMap)) {
            throw new Exception('The file "'.$fileName.'" cannot be readed or is empty.');
        }
    }

    /**
     * Getter of the CSV map.
     *
     * @access public
     * @return array The parsed CSV content.
     */
    public function getParsedFile()
    {
        return $this->csvMap;
    }

    #########################################################
    # Private Functions
    #########################################################

    /**
     * Reads the CSV file through File_CSV_IteratorReader.
     *
     * @access private
     * @param string $fileName   Path of the CSV file to read.
     * @param string $separator  Field separator handed to the reader.
     * @param bool   $useHeaders Passed through to the reader's toArray().
     * @return mixed Whatever toArray() returns; the constructor treats an
     *               empty() value as a failure.
     */
    private function readCSV($fileName, $separator = ",", $useHeaders = false)
    {
        $csvReader = new File_CSV_IteratorReader($fileName, $separator);

        return $csvReader->toArray($useHeaders);
    }

    #########################################################
    # TODO: Maybe the following functions could be used in further workflows
    #########################################################

    /**
     * Replaces all byte sequences that need escaping. Characters that can
     * remain unencoded in N-Triples are not touched by the regex. The
     * replaced sequences are:
     *
     *   0x00-0x1F  non-printable characters
     *   0x22       double quote (")
     *   0x5C       backslash (\)
     *   0x7F       non-printable character (Control)
     *   0x80-0xBF  unexpected continuation byte
     *   0xC0-0xFF  first byte of a multi-byte character, followed by
     *              zero or more continuation bytes (0x80-0xBF)
     *
     * The regex accepts multi-byte sequences that do not have the correct
     * number of continuation bytes; escape_callback() sorts those out.
     *
     * @access private
     * @param string $str Raw byte string.
     * @return string The escaped string.
     */
    private function escape($str)
    {
        return preg_replace_callback(
            "/[\\x00-\\x1F\\x22\\x5C\\x7F]|[\\x80-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/",
            // The callback lives in this class (it used to name a foreign class).
            array(__CLASS__, 'escape_callback'),
            $str
        );
    }

    /**
     * Turns one regex match from escape() into its escaped representation.
     *
     * @access private
     * @param array $matches preg match array; only element 0 is used.
     * @return string A short escape (\t, \n, \r, \", \\), a \uXXXX or
     *                \UXXXXXXXX escape, or self::error_character for
     *                anything that is not acceptable UTF-8.
     */
    private static function escape_callback($matches)
    {
        $sequence = $matches[0];
        $lead = ord($sequence[0]);

        // Single-byte characters that have a dedicated short escape
        if ($lead == 0x09) {
            return '\\t';
        }
        if ($lead == 0x0A) {
            return '\\n';
        }
        if ($lead == 0x0D) {
            return '\\r';
        }
        if ($lead == 0x22) {
            return '\\"';
        }
        if ($lead == 0x5C) {
            return '\\\\';
        }
        if ($lead < 0x20 || $lead == 0x7F) {
            // Remaining control characters, encode as \u00XX
            return sprintf('\\u00%02X', $lead);
        }

        // Multi-byte characters
        if ($lead < 0xC0) {
            // Continuation bytes (0x80-0xBF) are not allowed to appear as first byte
            return self::error_character;
        }
        if ($lead < 0xE0) {
            // 110xxxxx, hex C0-DF
            $expected = 2;
            $codepoint = $lead & 0x1F;
        } elseif ($lead < 0xF0) {
            // 1110xxxx, hex E0-EF
            $expected = 3;
            $codepoint = $lead & 0x0F;
        } elseif ($lead < 0xF8) {
            // 11110xxx, hex F0-F7
            $expected = 4;
            $codepoint = $lead & 0x07;
        } elseif ($lead < 0xFC) {
            // 111110xx, hex F8-FB
            $expected = 5;
            $codepoint = $lead & 0x03;
        } elseif ($lead < 0xFE) {
            // 1111110x, hex FC-FD
            $expected = 6;
            $codepoint = $lead & 0x01;
        } else {
            // 11111110 and 11111111, hex FE-FF, are not allowed
            return self::error_character;
        }

        // Verify the number of continuation bytes (0x80 to 0xBF)
        $length = strlen($sequence);
        if ($length < $expected) {
            // Not enough continuation bytes
            return self::error_character;
        }
        // Too many continuation bytes -- report each surplus byte as one error
        $rest = str_repeat(self::error_character, $length - $expected);

        // Append the lowest six bits of every continuation byte (10xxxxxx)
        for ($i = 1; $i < $expected; $i++) {
            $codepoint = ($codepoint << 6) | (ord($sequence[$i]) & 0x3F);
        }

        // Reject overlong encodings (a character encoded in more bytes than
        // necessary): each length has a smallest code point it may carry.
        $smallest = array(2 => 0x80, 3 => 0x800, 4 => 0x10000, 5 => 0x200000, 6 => 0x4000000);
        if ($codepoint < $smallest[$expected]) {
            return self::error_character . $rest;
        }
        // UTF-16 surrogates must not be used in UTF-8
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return self::error_character . $rest;
        }
        // Misc. illegal code positions
        if ($codepoint == 0xFFFE || $codepoint == 0xFFFF) {
            return self::error_character . $rest;
        }

        if ($codepoint <= 0xFFFF) {
            // 0x0080-0xFFFF, encode as \uXXXX
            return sprintf('\\u%04X', $codepoint) . $rest;
        }
        if ($codepoint <= 0x10FFFF) {
            // 0x10000-0x10FFFF, encode as \UXXXXXXXX
            return sprintf('\\U%08X', $codepoint) . $rest;
        }

        // Code points above 0x10FFFF are outside the Unicode range
        return self::error_character . $rest;
    }
}
?>