-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moved the extension folder back to its original name. As far as I can see this didn't break anything, because the name should not be hardcoded anywhere... but keep your eyes open.
- Loading branch information
0 parents
commit 04b6bbe
Showing
12 changed files
with
1,162 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
<?php | ||
require_once 'lib/IteratorReader.php'; | ||
|
||
/**
 * Reads a CSV file through File_CSV_IteratorReader and exposes the parsed
 * content as an array.
 *
 * The class additionally carries two currently unused helpers, escape() and
 * escape_callback(), which escape a string the way N-Triples literals require.
 *
 * @category component
 * @package  csvimport
 * @author   Michael Martin [email protected]
 */
class CsvParser
{
    /**
     * Replacement text emitted for every invalid or unrepresentable byte
     * sequence: the six literal characters \uFFFD (the escaped form of the
     * Unicode replacement character), not the character itself.
     */
    const error_character = '\\uFFFD';

    /**
     * Content of the CSV file as returned by File_CSV_IteratorReader::toArray().
     * NOTE(review): presumably one associative array per row - confirm against
     * lib/IteratorReader.php.
     *
     * @var array
     * @access private
     * @author Michael Martin [email protected]
     */
    private $csvMap;

    /**
     * Raises the script limits, parses the given CSV file and stores the
     * result. Throws if the parsed result is empty.
     *
     * @access public
     * @param  string $fileName The csv file.
     * @author Michael Martin [email protected]
     *
     * @throws Exception when the reader returns an empty result.
     */
    public function __construct($fileName = "")
    {
        // Lift limits that would otherwise get in the way of the import.
        // These settings stay in effect for the rest of the request.
        ini_set("max_execution_time", "600");
        ini_set("memory_limit", "1536M");
        ini_set("auto_detect_line_endings", TRUE);

        // Start from a defined state before parsing.
        $this->csvMap = array();

        // Parse the file; an empty result is treated as a failure.
        $this->csvMap = $this->readCSV($fileName);
        if (empty($this->csvMap)) {
            throw new Exception('The file "'.$fileName.'" cannot be readed or is empty.');
        }
    }

    /**
     * Getter of the CSV map.
     *
     * @access public
     * @return array The parsed file content stored by the constructor.
     */
    public function getParsedFile()
    {
        return $this->csvMap;
    }

    #########################################################
    # Private Functions
    #########################################################

    /**
     * Parses the csv file with a comma as the separator argument and returns
     * the reader's array representation.
     *
     * @access private
     * @param  string $fileName The csv file.
     * @return array Whatever File_CSV_IteratorReader::toArray() yields.
     */
    private function readCSV($fileName)
    {
        $csvReader = new File_CSV_IteratorReader($fileName, ",");
        return $csvReader->toArray();
    }

    #########################################################
    # TODO: Maybe the following functions could be used in further workflows
    #########################################################

    /**
     * Replaces all byte sequences that need escaping. Characters that can
     * remain unencoded in N-Triples are not touched by the regex. The
     * replaced sequences are:
     *
     *   0x00-0x1F   non-printable characters
     *   0x22        double quote (")
     *   0x5C        backslash (\)
     *   0x7F        non-printable character (Control)
     *   0x80-0xBF   unexpected continuation byte,
     *   0xC0-0xFF   first byte of multi-byte character,
     *               followed by one or more continuation byte (0x80-0xBF)
     *
     * The regex accepts multi-byte sequences that don't have the correct
     * number of continuation bytes (0x80-0xBF). This is handled by the
     * callback.
     *
     * @access private
     * @param  string $str Raw byte string to escape.
     * @return string The escaped string.
     */
    private function escape($str)
    {
        // The callback is a private static method of THIS class; the former
        // 'Transformer' class name was a leftover from the code's origin.
        // __CLASS__ keeps the callable valid wherever the class is loaded.
        return preg_replace_callback(
            "/[\\x00-\\x1F\\x22\\x5C\\x7F]|[\\x80-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/",
            array(__CLASS__, 'escape_callback'),
            $str
        );
    }

    /**
     * Converts one regex match from escape() into its escaped form.
     *
     * Control and quoting characters become backslash escapes, well-formed
     * UTF-8 sequences become \uXXXX or \UXXXXXXXX, and anything malformed
     * becomes self::error_character.
     *
     * @access private
     * @param  array $matches Match array from preg_replace_callback; only
     *                        element 0 (the matched bytes) is used.
     * @return string Escaped replacement text.
     */
    private static function escape_callback($matches)
    {
        $encoded_character = $matches[0];
        $byte = ord($encoded_character[0]);

        // Single-byte characters (0xxxxxxx, hex 00-7F)
        if ($byte == 0x09) {
            return "\\t";
        }
        if ($byte == 0x0A) {
            return "\\n";
        }
        if ($byte == 0x0D) {
            return "\\r";
        }
        if ($byte == 0x22) {
            return "\\\"";
        }
        if ($byte == 0x5C) {
            return "\\\\";
        }
        if ($byte < 0x20 || $byte == 0x7F) {
            // encode as \u00XX
            return "\\u00" . sprintf("%02X", $byte);
        }

        // Multi-byte characters
        if ($byte < 0xC0) {
            // Continuation bytes (0x80-0xBF) are not allowed to appear as first byte
            return self::error_character;
        }
        if ($byte < 0xE0) {
            // 110xxxxx, hex C0-DF
            $bytes = 2;
            $codepoint = $byte & 0x1F;
        } else if ($byte < 0xF0) {
            // 1110xxxx, hex E0-EF
            $bytes = 3;
            $codepoint = $byte & 0x0F;
        } else if ($byte < 0xF8) {
            // 11110xxx, hex F0-F7
            $bytes = 4;
            $codepoint = $byte & 0x07;
        } else if ($byte < 0xFC) {
            // 111110xx, hex F8-FB
            $bytes = 5;
            $codepoint = $byte & 0x03;
        } else if ($byte < 0xFE) {
            // 1111110x, hex FC-FD
            $bytes = 6;
            $codepoint = $byte & 0x01;
        } else {
            // 11111110 and 11111111, hex FE-FF, are not allowed
            return self::error_character;
        }

        // Verify correct number of continuation bytes (0x80 to 0xBF)
        $length = strlen($encoded_character);
        if ($length < $bytes) {
            // not enough continuation bytes
            return self::error_character;
        }

        if ($length > $bytes) {
            // Too many continuation bytes -- show each as one error
            $rest = str_repeat(self::error_character, $length - $bytes);
        } else {
            $rest = '';
        }

        // Calculate Unicode codepoints from the bytes
        for ($i = 1; $i < $bytes; $i++) {
            // Loop over the additional bytes (0x80-0xBF, 10xxxxxx)
            // Add their lowest six bits to the end of the codepoint
            $byte = ord($encoded_character[$i]);
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
        }

        // Check for overlong encoding (character is encoded as more bytes than
        // necessary, this must be rejected by a safe UTF-8 decoder). Each bound
        // is the largest value the next-shorter form can hold; for six bytes
        // that is 0x3FFFFFF (26 bits), the maximum of the five-byte form.
        if (($bytes == 2 && $codepoint <= 0x7F) ||
            ($bytes == 3 && $codepoint <= 0x7FF) ||
            ($bytes == 4 && $codepoint <= 0xFFFF) ||
            ($bytes == 5 && $codepoint <= 0x1FFFFF) ||
            ($bytes == 6 && $codepoint <= 0x3FFFFFF)) {
            return self::error_character . $rest;
        }

        // Check for UTF-16 surrogates, which must not be used in UTF-8
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return self::error_character . $rest;
        }

        // Misc. illegal code positions
        if ($codepoint == 0xFFFE || $codepoint == 0xFFFF) {
            return self::error_character . $rest;
        }

        if ($codepoint <= 0xFFFF) {
            // 0x0080-0xFFFF, encode as \uXXXX
            return "\\u" . sprintf("%04X", $codepoint) . $rest;
        }

        if ($codepoint <= 0x10FFFF) {
            // 0x10000-0x10FFFF, encode as \UXXXXXXXX
            return "\\U" . sprintf("%08X", $codepoint) . $rest;
        }

        // Unicode codepoint above 0x10FFFF, no characters have been assigned
        // to those codepoints
        return self::error_character . $rest;
    }
}
?> |
Oops, something went wrong.