2 Commits
0.3.2 ... 0.4.0

Author SHA1 Message Date
zynode
70366e3085 parseCSV 0.4 beta
- Error reporting for files/data which is corrupt
  or has formatting errors, such as using double
  quotes in a field without enclosing quotes, or
  not escaping double quotes with a second one.

- parse() method does not require input anymore
  if the "$object->file" property has been set.

I'm calling this a beta release due to the heavy
modifications to the core parsing logic required
for error reporting to work. I have tested the
new code quite extensively, and I'm fairly confident
that it still parses exactly as it always has.

The second reason I'm calling it a beta release
is because I'm sure the error reporting code will
need more refinements and tweaks to detect more
types of errors, as it's only picking up two types
of syntax errors right now. However, it seems
these two are the most common errors that you
would be likely to come across.

git-svn-id: http://parsecsv-for-php.googlecode.com/svn/trunk@28 339761fc-0c37-0410-822d-8b8cac1f6a97
2008-04-11 18:13:37 +00:00
zynode
2dfd35b988 added error reporting/validation of parsed data, still needs some more testing before release tho...
git-svn-id: http://parsecsv-for-php.googlecode.com/svn/trunk@27 339761fc-0c37-0410-822d-8b8cac1f6a97
2008-04-07 16:17:44 +00:00
3 changed files with 128 additions and 39 deletions

View File

@@ -1,3 +1,32 @@
parseCSV 0.4 beta
-----------------------------------
Date: 11-Apr-2008
- Error reporting for files/data which is corrupt
or has formatting errors, such as using double
quotes in a field without enclosing quotes, or
not escaping double quotes with a second one.
- parse() method does not require input anymore
if the "$object->file" property has been set.
I'm calling this a beta release due to the heavy
modifications to the core parsing logic required
for error reporting to work. I have tested the
new code quite extensively, and I'm fairly confident
that it still parses exactly as it always has.
The second reason I'm calling it a beta release
is because I'm sure the error reporting code will
need more refinements and tweaks to detect more
types of errors, as it's only picking up two types
of syntax errors right now. However, it seems
these two are the most common errors that you
would be likely to come across.
-----------------------------------
parseCSV 0.3.2 parseCSV 0.3.2
----------------------------------- -----------------------------------
Date: 1-Apr-2008 Date: 1-Apr-2008

View File

@@ -4,7 +4,7 @@ class parseCSV {
/* /*
Class: parseCSV v0.3.2 Class: parseCSV v0.4 beta
http://code.google.com/p/parsecsv-for-php/ http://code.google.com/p/parsecsv-for-php/
@@ -140,6 +140,19 @@ class parseCSV {
# loaded file contents # loaded file contents
var $file_data; var $file_data;
# error while parsing input data
# 0 = No errors found. Everything should be fine :)
# 1 = Hopefully correctable syntax error was found.
# 2 = Enclosure character (double quote by default)
# was found in non-enclosed field. This means
# the file is either corrupt, or does not
# follow standard CSV formatting. Please validate
# the parsed data yourself.
var $error = 0;
# detailed error info
var $error_info = array();
# array of field values in data parsed # array of field values in data parsed
var $titles = array(); var $titles = array();
@@ -170,6 +183,7 @@ class parseCSV {
* @return nothing * @return nothing
*/ */
function parse ($input = null, $offset = null, $limit = null, $conditions = null) { function parse ($input = null, $offset = null, $limit = null, $conditions = null) {
if ( $input === null ) $input = $this->file;
if ( !empty($input) ) { if ( !empty($input) ) {
if ( $offset !== null ) $this->offset = $offset; if ( $offset !== null ) $this->offset = $offset;
if ( $limit !== null ) $this->limit = $limit; if ( $limit !== null ) $this->limit = $limit;
@@ -272,12 +286,12 @@ class parseCSV {
$pch = ( isset($data{$i-1}) ) ? $data{$i-1} : false ; $pch = ( isset($data{$i-1}) ) ? $data{$i-1} : false ;
// open and closing quotes // open and closing quotes
if ( $ch == $enclosure && (!$enclosed || $nch != $enclosure) ) { if ( $ch == $enclosure ) {
$enclosed = ( $enclosed ) ? false : true ; if ( !$enclosed || $nch != $enclosure ) {
$enclosed = ( $enclosed ) ? false : true ;
// inline quotes } elseif ( $enclosed ) {
} elseif ( $ch == $enclosure && $enclosed ) { $i++;
$i++; }
// end of row // end of row
} elseif ( ($ch == "\n" && $pch != "\r" || $ch == "\r") && !$enclosed ) { } elseif ( ($ch == "\n" && $pch != "\r" || $ch == "\r") && !$enclosed ) {
@@ -311,13 +325,12 @@ class parseCSV {
// capture most probable delimiter // capture most probable delimiter
ksort($filtered); ksort($filtered);
$delimiter = reset($filtered); $this->delimiter = reset($filtered);
$this->delimiter = $delimiter;
// parse data // parse data
if ( $parse ) $this->data = $this->parse_string(); if ( $parse ) $this->data = $this->parse_string();
return $delimiter; return $this->delimiter;
} }
@@ -349,6 +362,8 @@ class parseCSV {
} else return false; } else return false;
} }
$white_spaces = str_replace($this->delimiter, '', " \t\x0B\0");
$rows = array(); $rows = array();
$row = array(); $row = array();
$row_count = 0; $row_count = 0;
@@ -365,22 +380,66 @@ class parseCSV {
$nch = ( isset($data{$i+1}) ) ? $data{$i+1} : false ; $nch = ( isset($data{$i+1}) ) ? $data{$i+1} : false ;
$pch = ( isset($data{$i-1}) ) ? $data{$i-1} : false ; $pch = ( isset($data{$i-1}) ) ? $data{$i-1} : false ;
// open and closing quotes // open/close quotes, and inline quotes
if ( $ch == $this->enclosure && (!$enclosed || $nch != $this->enclosure) ) { if ( $ch == $this->enclosure ) {
$enclosed = ( $enclosed ) ? false : true ; if ( !$enclosed ) {
if ( $enclosed ) $was_enclosed = true; if ( ltrim($current, $white_spaces) == '' ) {
$enclosed = true;
// inline quotes $was_enclosed = true;
} elseif ( $ch == $this->enclosure && $enclosed ) { } else {
$current .= $ch; $this->error = 2;
$i++; $error_row = count($rows) + 1;
$error_col = $col + 1;
if ( !isset($this->error_info[$error_row.'-'.$error_col]) ) {
$this->error_info[$error_row.'-'.$error_col] = array(
'type' => 2,
'info' => 'Syntax error found on row '.$error_row.'. Non-enclosed fields can not contain double-quotes.',
'row' => $error_row,
'field' => $error_col,
'field_name' => (!empty($head[$col])) ? $head[$col] : null,
);
}
$current .= $ch;
}
} elseif ($nch == $this->enclosure) {
$current .= $ch;
$i++;
} elseif ( $nch != $this->delimiter && $nch != "\r" && $nch != "\n" ) {
for ( $x=($i+1); isset($data{$x}) && ltrim($data{$x}, $white_spaces) == ''; $x++ ) {}
if ( $data{$x} == $this->delimiter ) {
$enclosed = false;
$i = $x;
} else {
if ( $this->error < 1 ) {
$this->error = 1;
}
$error_row = count($rows) + 1;
$error_col = $col + 1;
if ( !isset($this->error_info[$error_row.'-'.$error_col]) ) {
$this->error_info[$error_row.'-'.$error_col] = array(
'type' => 1,
'info' =>
'Syntax error found on row '.(count($rows) + 1).'. '.
'A single double-quote was found within an enclosed string. '.
'Enclosed double-quotes must be escaped with a second double-quote.',
'row' => count($rows) + 1,
'field' => $col + 1,
'field_name' => (!empty($head[$col])) ? $head[$col] : null,
);
}
$current .= $ch;
$enclosed = false;
}
} else {
$enclosed = false;
}
// end of field/row // end of field/row
} elseif ( ($ch == $this->delimiter || ($ch == "\n" && $pch != "\r") || $ch == "\r") && !$enclosed ) { } elseif ( ($ch == $this->delimiter || $ch == "\n" || $ch == "\r") && !$enclosed ) {
if ( !$was_enclosed ) $current = trim($current);
$key = ( !empty($head[$col]) ) ? $head[$col] : $col ; $key = ( !empty($head[$col]) ) ? $head[$col] : $col ;
$row[$key] = $current; $row[$key] = ( $was_enclosed ) ? $current : trim($current) ;
$current = ''; $current = '';
$was_enclosed = false;
$col++; $col++;
// end of row // end of row
@@ -405,6 +464,7 @@ class parseCSV {
if ( $this->sort_by === null && $this->limit !== null && count($rows) == $this->limit ) { if ( $this->sort_by === null && $this->limit !== null && count($rows) == $this->limit ) {
$i = $strlen; $i = $strlen;
} }
if ( $ch == "\r" && $nch == "\n" ) $i++;
} }
// append character to current field // append character to current field