I have a big problem reading and parsing large text files. The files have many lines (about 150k), and I have to read them and then parse the information. I do it with fread, reading the WHOLE file, and then I parse it in memory with sscanf. The file format is relatively basic:
[HEADER]
Depth Measurement Unit = ft
Curve Description =
Curve Measurement Unit = None
Source = imported
Module = ascii-import
[POINTS]
0.1000 78.4980
0.2000 78.6448
0.3000 78.7916
0.4000 79.1416
0.5000 79.6948
0.6000 80.2480
0.7000 80.3952
0.8000 80.5424
... the lines continue up to approx. 150 thousand, and sometimes more.
To read it, I use the following code:
FILE *file;
char *buffer;
unsigned long fileLen;
//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
return;
}
//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, 'void LabCurveSA::loadPoints()
{
QString fullpathname = this->m_sPath + this->m_sName + WELL_CURVE_EXTENSION;
FILE *file;
char *buffer;
unsigned long fileLen;
//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
return;
}
//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, '[HEADER]
Depth Measurement Unit = ft
Curve Description =
Curve Measurement Unit = None
Source = imported
Module = ascii-import
[POINTS]
0.1000 78.4980
0.2000 78.6448
0.3000 78.7916
0.4000 79.1416
0.5000 79.6948
0.6000 80.2480
0.7000 80.3952
0.8000 80.5424
... siguen las lineas hasta aprox. 150mil y a veces mas.
', fileLen + 1);
if (!buffer)
{
fclose(file);
return;
}
//Read file contents into buffer
fread(buffer, fileLen, 1, file);
fclose(file);
//Do what ever with buffer
const char *ptr = buffer;
char field [256] = "FILE *file;
char *buffer;
unsigned long fileLen;
//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
return;
}
//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, 'void LabCurveSA::loadPoints()
{
QString fullpathname = this->m_sPath + this->m_sName + WELL_CURVE_EXTENSION;
FILE *file;
char *buffer;
unsigned long fileLen;
//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
return;
}
//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, '%pre%', fileLen + 1);
if (!buffer)
{
fclose(file);
return;
}
//Read file contents into buffer
fread(buffer, fileLen, 1, file);
fclose(file);
//Do what ever with buffer
const char *ptr = buffer;
char field [256] = "%pre%";
int n;
bool found = false;
char strstart[11] = "[POINTS]\r%pre%";
while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1 && !found)
{
if (!found)
{
if (strcmp(field, strstart) == 0)
{
found = true;
}
}
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
}
if (found)
{
int x = 0;
bool val_ant_null = true;
while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1)
{
const char *subptr = field;
char parts[128] = "%pre%";
int y;
double depth, sample;
QPointF p;
bool firstnum = true, secondnum = true;
if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1)
{
try{
depth = atof(parts);
}
catch(char *str)
{
Q_UNUSED(str);
depth = 0;
firstnum = false;
}
subptr +=y;
++subptr;
}
memset(parts, 0, 128);
if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1)
{
try{
sample = atof(parts);
}
catch(char *str)
{
Q_UNUSED(str);
sample = 0;
secondnum = false;
}
}
memset(parts, 0, 128);
if (firstnum && secondnum)
{
if (dcomparer(depth, DEqual, 0.0, 4) && x != 0)
{
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
continue;
}
if (x == 0 && val_ant_null && sample >= NULL_POINT)
{
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
continue;
}
else
{
if (x != 0)
{
val_ant_null = sample >= NULL_POINT;
if (val_ant_null)
x = -1;
}
}
if (sample < NULL_POINT)
{
if (sample > m_dMax)
m_dMax = sample;
if (sample < m_dMin)
m_dMin = sample;
}
p.setX(sample);
p.setY(depth);
this->m_arPoints.append(p);
x++;
}
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
}
if (this->m_arPoints.count() > 0)
{
//if (this->m_arPoints.at(0).x() >= NULL_POINT)
while(this->m_arPoints.at(0).x() >= NULL_POINT)
this->m_arPoints.remove(0);
}
if (this->m_arPoints.count() > 0)
{
if (this->m_arPoints.at(this->m_arPoints.count() - 1).x() >= NULL_POINT)
this->m_arPoints.remove(this->m_arPoints.count() - 1);
}
}
free(buffer);
}
', fileLen + 1);
if (!buffer)
{
fclose(file);
return;
}
//Read file contents into buffer
fread(buffer, fileLen, 1, file);
fclose(file);
";
int n;
bool found = false;
char strstart[11] = "[POINTS]\r%pre%";
while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1 && !found)
{
if (!found)
{
if (strcmp(field, strstart) == 0)
{
found = true;
}
}
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
}
if (found)
{
int x = 0;
bool val_ant_null = true;
while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1)
{
const char *subptr = field;
char parts[128] = "%pre%";
int y;
double depth, sample;
QPointF p;
bool firstnum = true, secondnum = true;
if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1)
{
try{
depth = atof(parts);
}
catch(char *str)
{
Q_UNUSED(str);
depth = 0;
firstnum = false;
}
subptr +=y;
++subptr;
}
memset(parts, 0, 128);
if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1)
{
try{
sample = atof(parts);
}
catch(char *str)
{
Q_UNUSED(str);
sample = 0;
secondnum = false;
}
}
memset(parts, 0, 128);
if (firstnum && secondnum)
{
if (dcomparer(depth, DEqual, 0.0, 4) && x != 0)
{
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
continue;
}
if (x == 0 && val_ant_null && sample >= NULL_POINT)
{
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
continue;
}
else
{
if (x != 0)
{
val_ant_null = sample >= NULL_POINT;
if (val_ant_null)
x = -1;
}
}
if (sample < NULL_POINT)
{
if (sample > m_dMax)
m_dMax = sample;
if (sample < m_dMin)
m_dMin = sample;
}
p.setX(sample);
p.setY(depth);
this->m_arPoints.append(p);
x++;
}
ptr += n;
if ( *ptr != '\n' )
{
break;
}
++ptr;
memset(field, 0, 256);
}
if (this->m_arPoints.count() > 0)
{
//if (this->m_arPoints.at(0).x() >= NULL_POINT)
while(this->m_arPoints.at(0).x() >= NULL_POINT)
this->m_arPoints.remove(0);
}
if (this->m_arPoints.count() > 0)
{
if (this->m_arPoints.at(this->m_arPoints.count() - 1).x() >= NULL_POINT)
this->m_arPoints.remove(this->m_arPoints.count() - 1);
}
}
free(buffer);
}
', fileLen + 1);
if (!buffer)
{
fclose(file);
return;
}
//Read file contents into buffer
fread(buffer, fileLen, 1, file);
fclose(file);
and then I parse it, reading the lines and converting both columns of the POINTS section of the file into numbers. The complete code for reading and parsing is the `LabCurveSA::loadPoints()` function shown above.
This process takes a long time. What alternatives are there? Someone told me that the data could be saved as binary inside an XML file and that this would increase the reading speed. Can someone give me an idea of how I can speed up this reading and/or writing process? Thanks.