Skip to content

Commit

Permalink
fixes issue 891.
Browse files Browse the repository at this point in the history
moved extension folder back to its original name.
as far as I can see this didn't break anything because the name should not be hardcoded anywhere...
but keep your eyes open
  • Loading branch information
jbrekle committed Feb 19, 2011
0 parents commit 04b6bbe
Show file tree
Hide file tree
Showing 12 changed files with 1,162 additions and 0 deletions.
206 changes: 206 additions & 0 deletions CsvParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
<?php
require_once 'lib/IteratorReader.php';

/**
* @category component
* @package csvimport
* @author Michael Martin [email protected]
*/
class CsvParser
{
    /**
     * Replacement emitted by escape_callback() for every byte sequence that
     * is not valid UTF-8: the six literal characters \uFFFD (the escaped form
     * of U+FFFD REPLACEMENT CHARACTER), not the character itself.
     */
    const error_character = '\\uFFFD';

    /**
     * Parsed content of the CSV file, exactly as returned by
     * File_CSV_IteratorReader::toArray().
     *
     * @var array
     * @access private
     * @author Michael Martin [email protected]
     */
    private $csvMap;


    /**
     * Reads the given CSV file and stores the parsed result.
     *
     * Side effects: raises the PHP limits "max_execution_time" and
     * "memory_limit" and enables "auto_detect_line_endings" for the rest of
     * the request.
     *
     * @access public
     * @param str $fileName The csv file.
     * @author Michael Martin [email protected]
     *
     * @throws Exception if the parsed result is empty (file unreadable or
     *                   without content).
     */
    public function __construct($fileName = "")
    {
        // Large imports would otherwise hit the default runtime/memory limits.
        ini_set("max_execution_time", "600");
        ini_set("memory_limit", "1536M");
        ini_set("auto_detect_line_endings", TRUE);

        // Start from a defined state in case reading fails below.
        $this->csvMap = array();

        // Parse the file; an empty result is treated as a failure.
        $this->csvMap = $this->readCSV($fileName);
        if (empty($this->csvMap)) {
            throw new Exception('The file "'.$fileName.'" cannot be readed or is empty.');
        }
    }



    /**
     * Getter of the parsed CSV data.
     *
     * @access public
     * @return array The value produced by readCSV() in the constructor.
     */
    public function getParsedFile()
    {
        return $this->csvMap;
    }





    #########################################################
    # Private Functions
    #########################################################

    /**
     * Parses the CSV file with File_CSV_IteratorReader, using "," as the
     * delimiter, and returns the reader's array representation.
     *
     * NOTE(review): whether the reader throws or returns an empty value for
     * an unreadable file is not visible here -- confirm in
     * lib/IteratorReader.php.
     *
     * @access private
     * @param str $fileName The csv file.
     * @return array Result of File_CSV_IteratorReader::toArray().
     */
    private function readCSV($fileName)
    {
        $csvReader = new File_CSV_IteratorReader($fileName, ",");
        return $csvReader->toArray();
    }

    #########################################################
    # TODO: Maybe these following function could be used in further workflows
    #########################################################

    /**
     * Replaces all byte sequences that need escaping. Characters that can
     * remain unencoded in N-Triples are not touched by the regex. The
     * replaced sequences are:
     *
     *   0x00-0x1F   non-printable characters
     *   0x22        double quote (")
     *   0x5C        backslash (\)
     *   0x7F        non-printable character (Control)
     *   0x80-0xBF   unexpected continuation byte,
     *   0xC0-0xFF   first byte of multi-byte character,
     *               followed by one or more continuation byte (0x80-0xBF)
     *
     * The regex accepts multi-byte sequences that don't have the correct
     * number of continuation bytes (0x80-0xBF). This is handled by the
     * callback.
     *
     * @access private
     * @param string $str Raw byte string (expected to be UTF-8).
     * @return string The string with every matched sequence replaced by the
     *                result of escape_callback().
     */
    private function escape($str)
    {
        // The callback lives on this class; it is private, which is fine
        // because the callable is resolved from inside the class scope.
        return preg_replace_callback(
            "/[\\x00-\\x1F\\x22\\x5C\\x7F]|[\\x80-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/",
            array(__CLASS__, 'escape_callback'),
            $str
        );
    }

    /**
     * Callback for escape(): converts one matched byte sequence into its
     * escaped form.
     *
     * Control characters, the double quote and the backslash become short
     * escapes (\t, \n, \r, \", \\) or \u00XX. Well-formed multi-byte UTF-8
     * sequences are decoded to their code point and emitted as \uXXXX or
     * \UXXXXXXXX. Anything malformed (stray continuation byte, truncated or
     * overlong sequence, surrogate, U+FFFE/U+FFFF, code point above U+10FFFF)
     * becomes self::error_character; each surplus continuation byte adds one
     * more self::error_character.
     *
     * @access private
     * @param array $matches Match array from preg_replace_callback();
     *                       only $matches[0] is used.
     * @return string The escaped replacement.
     */
    private static function escape_callback($matches)
    {
        $encoded_character = $matches[0];
        $byte = ord($encoded_character[0]);

        // Single-byte characters (0xxxxxxx, hex 00-7F)
        if ($byte == 0x09) return "\\t";
        if ($byte == 0x0A) return "\\n";
        if ($byte == 0x0D) return "\\r";
        if ($byte == 0x22) return "\\\"";
        if ($byte == 0x5C) return "\\\\";
        if ($byte < 0x20 || $byte == 0x7F) {
            // encode as \u00XX
            return "\\u00" . sprintf("%02X", $byte);
        }

        // Multi-byte characters
        if ($byte < 0xC0) {
            // Continuation bytes (0x80-0xBF) are not allowed to appear as first byte
            return self::error_character;
        }
        if ($byte < 0xE0) {
            // 110xxxxx, hex C0-DF
            $bytes = 2;
            $codepoint = $byte & 0x1F;
        } else if ($byte < 0xF0) {
            // 1110xxxx, hex E0-EF
            $bytes = 3;
            $codepoint = $byte & 0x0F;
        } else if ($byte < 0xF8) {
            // 11110xxx, hex F0-F7
            $bytes = 4;
            $codepoint = $byte & 0x07;
        } else if ($byte < 0xFC) {
            // 111110xx, hex F8-FB
            $bytes = 5;
            $codepoint = $byte & 0x03;
        } else if ($byte < 0xFE) {
            // 1111110x, hex FC-FD
            $bytes = 6;
            $codepoint = $byte & 0x01;
        } else {
            // 11111110 and 11111111, hex FE-FF, are not allowed
            return self::error_character;
        }

        // Verify correct number of continuation bytes (0x80 to 0xBF)
        $length = strlen($encoded_character);
        if ($length < $bytes) {
            // not enough continuation bytes
            return self::error_character;
        }

        if ($length > $bytes) {
            // Too many continuation bytes -- show each as one error
            $rest = str_repeat(self::error_character, $length - $bytes);
        } else {
            $rest = '';
        }

        // Calculate Unicode codepoints from the bytes
        for ($i = 1; $i < $bytes; $i++) {
            // Loop over the additional bytes (0x80-0xBF, 10xxxxxx)
            // Add their lowest six bits to the end of the codepoint
            $byte = ord($encoded_character[$i]);
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
        }

        // Check for overlong encoding (character is encoded as more bytes than
        // necessary, this must be rejected by a safe UTF-8 decoder). Each
        // bound is the largest code point the next-shorter form can hold.
        if (($bytes == 2 && $codepoint <= 0x7F) ||
            ($bytes == 3 && $codepoint <= 0x7FF) ||
            ($bytes == 4 && $codepoint <= 0xFFFF) ||
            ($bytes == 5 && $codepoint <= 0x1FFFFF) ||
            ($bytes == 6 && $codepoint <= 0x3FFFFFF)) {
            return self::error_character . $rest;
        }

        // Check for UTF-16 surrogates, which must not be used in UTF-8
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return self::error_character . $rest;
        }

        // Misc. illegal code positions
        if ($codepoint == 0xFFFE || $codepoint == 0xFFFF) {
            return self::error_character . $rest;
        }

        if ($codepoint <= 0xFFFF) {
            // 0x0080-0xFFFF, encode as \uXXXX
            return "\\u" . sprintf("%04X", $codepoint) . $rest;
        }

        if ($codepoint <= 0x10FFFF) {
            // 0x10000-0x10FFFF, encode as \UXXXXXXXX
            return "\\U" . sprintf("%08X", $codepoint) . $rest;
        }

        // Unicode codepoint above 0x10FFFF, no characters have been assigned
        // to those codepoints
        return self::error_character . $rest;
    }
}
?>
Loading

0 comments on commit 04b6bbe

Please sign in to comment.