fixes issue 891.

Moved the extension folder back to its original name. As far as I can see this didn't break anything, because the name should not be hardcoded anywhere... but keep your eyes open.
AKSW · Feb 19, 2011 · 04b6bbe · 04b6bbe
commit 04b6bbe
Show file tree

Hide file tree

Showing 12 changed files with 1,162 additions and 0 deletions.
diff --git a/CsvParser.php b/CsvParser.php
@@ -0,0 +1,206 @@
+<?php
+require_once 'lib/IteratorReader.php';
+
+/**
+ *  @category		component
+ *  @package        csvimport		
+ *  @author			Michael Martin [email protected]
+ */
+class CsvParser
+{
+    // Replacement for invalid UTF-8 input: the literal text \uFFFD
+    const error_character = '\\uFFFD';
+
+    /**
+     * The parsed CSV file, as returned by File_CSV_IteratorReader::toArray().
+     *
+     * @var array
+     * @access private
+     * @author Michael Martin [email protected]
+     */
+    private $csvMap = array();
+
+    /**
+     * This is the constructor. It parses the CSV file and throws an exception
+     * when the result is empty. It also adjusts PHP limits through ini_set().
+     *
+     * @access public
+     * @param string $fileName The csv file.
+     * @author Michael Martin [email protected]
+     * @throws Exception if the parsed result is empty
+     */
+    public function __construct($fileName = "") {
+        // Preventing some limitations
+        ini_set("max_execution_time", "600");
+        ini_set("memory_limit", "1536M");
+        ini_set("auto_detect_line_endings", TRUE);
+
+        // Parse the file and check the status
+        $this->csvMap = $this->readCSV($fileName);
+        if (empty($this->csvMap)) {
+            throw new Exception('The file "'.$fileName.'" cannot be readed or is empty.');
+        }
+    }
+
+    /**
+     * Getter of the CSV map.
+     *
+     * @access public
+     * @return array $csvMap The parsed CSV data.
+     */
+    public function getParsedFile() {
+        return $this->csvMap;
+    }
+
+    #########################################################
+    # Private Functions
+    #########################################################
+
+    /**
+     * Parses $fileName with File_CSV_IteratorReader, passing "," (presumably the delimiter).
+     *
+     * @access private
+     * @param string $fileName The csv file.
+     * @return mixed Whatever File_CSV_IteratorReader::toArray() returns.
+     */
+    private function readCSV($fileName) {
+        $csvReader = new File_CSV_IteratorReader($fileName, ",");
+        return $csvReader->toArray();
+    }
+
+    #########################################################
+    # TODO: Maybe the following functions could be used in further workflows
+    #########################################################
+
+    /**
+     * Replaces all byte sequences that need escaping. Characters that can
+     * remain unencoded in N-Triples are not touched by the regex. The
+     * replaced sequences are:
+     *
+     * 0x00-0x1F   non-printable characters
+     * 0x22        double quote (")
+     * 0x5C        backslash (\)
+     * 0x7F        non-printable character (Control)
+     * 0x80-0xBF   unexpected continuation byte,
+     * 0xC0-0xFF   first byte of multi-byte character,
+     *             followed by zero or more continuation bytes (0x80-0xBF)
+     *
+     * The regex accepts multi-byte sequences that don't have the correct
+     * number of continuation bytes; the callback handles those.
+     *
+     * @access private
+     * @param string $str The raw byte string.
+     * @return string The escaped string.
+     */
+    private function escape($str) {
+        // __CLASS__ lets the callback resolve to escape_callback() below
+        return preg_replace_callback(
+            "/[\\x00-\\x1F\\x22\\x5C\\x7F]|[\\x80-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/",
+            array(__CLASS__, 'escape_callback'),
+            $str);
+    }
+
+    /**
+     * Callback of escape(): converts one matched byte sequence into its
+     * escaped form, or into error_character when it is not valid UTF-8.
+     *
+     * @access private
+     * @param array $matches Match data; only $matches[0] is read.
+     * @return string The replacement text.
+     */
+    private static function escape_callback($matches) {
+        $encoded_character = $matches[0];
+        $byte = ord($encoded_character[0]);
+        // Single-byte characters (0xxxxxxx, hex 00-7F)
+        if ($byte == 0x09) return "\\t";
+        if ($byte == 0x0A) return "\\n";
+        if ($byte == 0x0D) return "\\r";
+        if ($byte == 0x22) return "\\\"";
+        if ($byte == 0x5C) return "\\\\";
+        if ($byte < 0x20 || $byte == 0x7F) {
+            // encode as \u00XX
+            return "\\u00" . sprintf("%02X", $byte);
+        }
+
+        // Multi-byte characters
+        if ($byte < 0xC0) {
+            // Continuation bytes (0x80-0xBF) are not allowed to appear as first byte
+            return self::error_character;
+        }
+        if ($byte < 0xE0) { // 110xxxxx, hex C0-DF
+            $bytes = 2;
+            $codepoint = $byte & 0x1F;
+        } else if ($byte < 0xF0) {
+            // 1110xxxx, hex E0-EF
+            $bytes = 3;
+            $codepoint = $byte & 0x0F;
+        } else if ($byte < 0xF8) {
+            // 11110xxx, hex F0-F7
+            $bytes = 4;
+            $codepoint = $byte & 0x07;
+        } else if ($byte < 0xFC) {
+            // 111110xx, hex F8-FB
+            $bytes = 5;
+            $codepoint = $byte & 0x03;
+        } else if ($byte < 0xFE) {
+            // 1111110x, hex FC-FD
+            $bytes = 6;
+            $codepoint = $byte & 0x01;
+        } else {
+            // 11111110 and 11111111, hex FE-FF, are not allowed
+            return self::error_character;
+        }
+
+        // Verify correct number of continuation bytes (0x80 to 0xBF)
+        $length = strlen($encoded_character);
+        if ($length < $bytes) {
+            // not enough continuation bytes
+            return self::error_character;
+        }
+
+        // Too many continuation bytes -- show each surplus byte as one error
+        $rest = str_repeat(self::error_character, $length - $bytes);
+
+        // Calculate Unicode codepoints from the bytes
+        for ($i = 1; $i < $bytes; $i++) {
+            // Loop over the additional bytes (0x80-0xBF, 10xxxxxx)
+            // Add their lowest six bits to the end of the codepoint
+            $byte = ord($encoded_character[$i]);
+            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
+        }
+
+        // Check for overlong encoding (character is encoded as more bytes than
+        // necessary, this must be rejected by a safe UTF-8 decoder)
+        if (($bytes == 2 && $codepoint <= 0x7F) ||
+            ($bytes == 3 && $codepoint <= 0x7FF) ||
+            ($bytes == 4 && $codepoint <= 0xFFFF) ||
+            ($bytes == 5 && $codepoint <= 0x1FFFFF) ||
+            ($bytes == 6 && $codepoint <= 0x3FFFFFF)) {
+            return self::error_character . $rest;
+        }
+
+        // Check for UTF-16 surrogates, which must not be used in UTF-8
+        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
+            return self::error_character . $rest;
+        }
+
+        // Misc. illegal code positions
+        if ($codepoint == 0xFFFE || $codepoint == 0xFFFF) {
+            return self::error_character . $rest;
+        }
+
+        if ($codepoint <= 0xFFFF) {
+            // 0x0080-0xFFFF, encode as \uXXXX
+            return "\\u" . sprintf("%04X", $codepoint) . $rest;
+        }
+
+        if ($codepoint <= 0x10FFFF) {
+            // 0x10000-0x10FFFF, encode as \UXXXXXXXX
+            return "\\U" . sprintf("%08X", $codepoint) . $rest;
+        }
+        // Unicode codepoint above 0x10FFFF, no characters have been assigned
+        // to those codepoints
+        return self::error_character . $rest;
+    }
+}
+?>