Read large text files quickly

2

I have a big problem reading / parsing large text files. The files have many lines — about 150k — and I have to read them and then parse the information. I do it by means of a fread that reads the WHOLE file, and then I parse it in memory with sscanf. The file format is relatively basic:

[HEADER]
Depth Measurement Unit = ft
Curve Description = 
Curve Measurement Unit = None
Source = imported
Module = ascii-import
[POINTS]
0.1000 78.4980
0.2000 78.6448
0.3000 78.7916
0.4000 79.1416
0.5000 79.6948
0.6000 80.2480
0.7000 80.3952
0.8000 80.5424
... the lines continue up to approx. 150 thousand, and sometimes more.

To read it, I use the following code:

FILE *file;
char *buffer;
unsigned long fileLen;

//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
    return;
}

//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);

//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, '
void LabCurveSA::loadPoints()
{
QString fullpathname = this->m_sPath + this->m_sName + WELL_CURVE_EXTENSION;

FILE *file;
char *buffer;
unsigned long fileLen;

//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
    return;
}

//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);

//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, '
[HEADER]
Depth Measurement Unit = ft
Curve Description = 
Curve Measurement Unit = None
Source = imported
Module = ascii-import
[POINTS]
0.1000 78.4980
0.2000 78.6448
0.3000 78.7916
0.4000 79.1416
0.5000 79.6948
0.6000 80.2480
0.7000 80.3952
0.8000 80.5424
... siguen las lineas hasta aprox. 150mil y a veces mas.
', fileLen + 1); if (!buffer) { fclose(file); return; } //Read file contents into buffer fread(buffer, fileLen, 1, file); fclose(file); //Do what ever with buffer const char *ptr = buffer; char field [256] = "
FILE *file;
char *buffer;
unsigned long fileLen;

//Open file
file = fopen(qPrintable(fullpathname), "rb");
if (!file)
{
    return;
}

//Get file length
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);

//Allocate memory
buffer=(char *)malloc(fileLen+1);
memset(buffer, '
// Parse one "depth sample" line of the [POINTS] section.
// Returns true only when BOTH columns are valid numbers; on success
// *depth and *sample receive the parsed values.
// (The original wrapped atof in try/catch, but atof never throws in C++,
// so invalid text silently parsed as 0. strtod with an end-pointer check
// implements the rejection the original clearly intended.)
static bool parsePointLine(const char *line, double *depth, double *sample)
{
    char *end = NULL;

    *depth = strtod(line, &end);
    if (end == line)
        return false;               // first column is not a number

    const char *rest = end;         // strtod skips the space/tab separator
    *sample = strtod(rest, &end);
    if (end == rest)
        return false;               // second column missing or not a number

    return true;
}

// Load the curve file "<m_sPath><m_sName><WELL_CURVE_EXTENSION>", parse the
// [POINTS] section and fill m_arPoints (x = sample, y = depth), updating
// m_dMin / m_dMax over the non-NULL samples. Samples >= NULL_POINT are
// treated as a "no value" sentinel: leading/trailing sentinels are dropped
// and runs of sentinels inside the data are collapsed to a single point.
// NOTE(review): NULL_POINT / dcomparer are project-defined — assumed to be
// a numeric sentinel threshold and an epsilon comparer; confirm semantics.
void LabCurveSA::loadPoints()
{
    QString fullpathname = this->m_sPath + this->m_sName + WELL_CURVE_EXTENSION;

    FILE *file = fopen(qPrintable(fullpathname), "rb");
    if (!file)
        return;

    // Determine the file length.
    fseek(file, 0, SEEK_END);
    long fileLen = ftell(file);
    fseek(file, 0, SEEK_SET);
    if (fileLen <= 0)
    {
        fclose(file);
        return;
    }

    // Allocate and CHECK before touching the buffer. The original called
    // memset(buffer, ...) before the null check — undefined behavior when
    // malloc fails.
    char *buffer = (char *)malloc((size_t)fileLen + 1);
    if (!buffer)
    {
        fclose(file);
        return;
    }

    // Read whatever is available and NUL-terminate at the actual byte
    // count (the original ignored fread's return value).
    size_t bytesRead = fread(buffer, 1, (size_t)fileLen, file);
    fclose(file);
    buffer[bytesRead] = '\0';

    const char *ptr = buffer;
    char field[256];
    int n = 0;
    bool found = false;

    // Skip header lines until the [POINTS] marker. The "\r" variant covers
    // CRLF files ("%[^\n]" leaves the carriage return in the field); the
    // bare variant additionally accepts LF-only files, which the original
    // ("[POINTS]\r" only) silently failed on.
    while (!found && sscanf(ptr, "%255[^\n]%n", field, &n) == 1)
    {
        if (strcmp(field, "[POINTS]\r") == 0 || strcmp(field, "[POINTS]") == 0)
            found = true;

        ptr += n;
        if (*ptr != '\n')
            break;                  // last line of the file (no newline)
        ++ptr;
    }

    if (found)
    {
        int x = 0;                  // points accepted since the last sentinel run
        bool val_ant_null = true;   // previous sample was a sentinel (or start)

        while (sscanf(ptr, "%255[^\n]%n", field, &n) == 1)
        {
            double depth = 0.0, sample = 0.0;
            bool accept = parsePointLine(field, &depth, &sample);

            if (accept)
            {
                if (x != 0 && dcomparer(depth, DEqual, 0.0, 4))
                {
                    // A zero depth after the first point: skip the line.
                    accept = false;
                }
                else if (x == 0 && val_ant_null && sample >= NULL_POINT)
                {
                    // Collapse leading runs of sentinel samples.
                    accept = false;
                }
                else if (x != 0)
                {
                    val_ant_null = sample >= NULL_POINT;
                    if (val_ant_null)
                        x = -1;     // restart the count after a sentinel
                }
            }

            if (accept)
            {
                // Only real samples participate in the min/max range.
                if (sample < NULL_POINT)
                {
                    if (sample > m_dMax)
                        m_dMax = sample;
                    if (sample < m_dMin)
                        m_dMin = sample;
                }

                QPointF p(sample, depth);   // x = sample, y = depth
                this->m_arPoints.append(p);
                x++;
            }

            ptr += n;
            if (*ptr != '\n')
                break;              // last line of the file
            ++ptr;
        }

        // Drop sentinel points left at the front. Guarding count() inside
        // the loop condition fixes the original, which could call at(0) on
        // an emptied container when every remaining point was a sentinel.
        while (this->m_arPoints.count() > 0 &&
               this->m_arPoints.at(0).x() >= NULL_POINT)
            this->m_arPoints.remove(0);

        // Drop a trailing sentinel point, if any.
        if (this->m_arPoints.count() > 0 &&
            this->m_arPoints.at(this->m_arPoints.count() - 1).x() >= NULL_POINT)
            this->m_arPoints.remove(this->m_arPoints.count() - 1);
    }

    free(buffer);
}
', fileLen + 1); if (!buffer) { fclose(file); return; } //Read file contents into buffer fread(buffer, fileLen, 1, file); fclose(file);
"; int n; bool found = false; char strstart[11] = "[POINTS]\r%pre%"; while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1 && !found) { if (!found) { if (strcmp(field, strstart) == 0) { found = true; } } ptr += n; if ( *ptr != '\n' ) { break; } ++ptr; memset(field, 0, 256); } if (found) { int x = 0; bool val_ant_null = true; while ( sscanf(ptr, "%255[^\n]%n", field, &n) == 1) { const char *subptr = field; char parts[128] = "%pre%"; int y; double depth, sample; QPointF p; bool firstnum = true, secondnum = true; if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1) { try{ depth = atof(parts); } catch(char *str) { Q_UNUSED(str); depth = 0; firstnum = false; } subptr +=y; ++subptr; } memset(parts, 0, 128); if (sscanf(subptr, "%127[^ \t]%n", parts, &y) == 1) { try{ sample = atof(parts); } catch(char *str) { Q_UNUSED(str); sample = 0; secondnum = false; } } memset(parts, 0, 128); if (firstnum && secondnum) { if (dcomparer(depth, DEqual, 0.0, 4) && x != 0) { ptr += n; if ( *ptr != '\n' ) { break; } ++ptr; memset(field, 0, 256); continue; } if (x == 0 && val_ant_null && sample >= NULL_POINT) { ptr += n; if ( *ptr != '\n' ) { break; } ++ptr; memset(field, 0, 256); continue; } else { if (x != 0) { val_ant_null = sample >= NULL_POINT; if (val_ant_null) x = -1; } } if (sample < NULL_POINT) { if (sample > m_dMax) m_dMax = sample; if (sample < m_dMin) m_dMin = sample; } p.setX(sample); p.setY(depth); this->m_arPoints.append(p); x++; } ptr += n; if ( *ptr != '\n' ) { break; } ++ptr; memset(field, 0, 256); } if (this->m_arPoints.count() > 0) { //if (this->m_arPoints.at(0).x() >= NULL_POINT) while(this->m_arPoints.at(0).x() >= NULL_POINT) this->m_arPoints.remove(0); } if (this->m_arPoints.count() > 0) { if (this->m_arPoints.at(this->m_arPoints.count() - 1).x() >= NULL_POINT) this->m_arPoints.remove(this->m_arPoints.count() - 1); } } free(buffer); }
', fileLen + 1); if (!buffer) { fclose(file); return; } //Read file contents into buffer fread(buffer, fileLen, 1, file); fclose(file);

and then I parse it, reading the lines and converting both columns of the POINTS section of the file into numbers. This is the complete code for reading and parsing:

%pre%

This process takes a lot ... what alternatives are there? ... someone told me that it can be saved as a binary inside an xml and that it would increase the speed of reading. Can someone give me an idea of how I can accelerate this reading and / or writing process? Thanks

    
asked by Emiliano Torres 18.04.2017 в 15:13
source

1 answer

1
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);

Based on the documentation of fseek we have the following:

  

Binary streams are not required to support SEEK_END, in particular if additional null bytes are output.

That is, for binary readings SEEK_END does not have to be supported and, consequently, it may malfunction.

If, on the other hand, the file were text, SEEK_END is not a good idea either: unless you stay within the ASCII world you will find characters that occupy more than one byte, and the final reservation you make will be incorrect.

Well, then, how to read large files? Sometimes the simplest is what works best. To me personally this system has not given me bad results:

// Read an entire file into a std::string in one shot via rdbuf().
// Fixed: std::istream is not default-constructible and has no open()
// member — the file-opening stream type is std::ifstream.
std::ifstream in;
in.open("fichero");
std::ostringstream ss;
ss << in.rdbuf();
std::string contenido = ss.str();

If the files were already large to the point of absurdity (several gigas) it is advisable to choose to read the file in fragments and store them in a container type deque :

const size_t tamFragmento = static_cast<std::size_t>(BUFSIZ);
std::array<char,tamFragmento> fragmento;
std::deque<char> contenido;

while( in.read(fragmento.data(),fragmento.size()) || in.gcount() )
{
  contenido.insert(contenido.end(),
                   fragmento.begin(),
                   fragmento.begin() + in.gcount());
}

The use of BUFSIZ is used because in theory this macro indicates an adequate value to work efficiently with the input mechanisms output.

A fragment of the documentation about it:

  

The value of this macro is an integer constant expression that is good to use for the size argument to setvbuf. This value is guaranteed to be at least 256.

     

The value of BUFSIZ is chosen on each system so as to make stream I / O efficient. So it is a good idea to use BUFSIZ as the size for the buffer when you call setvbuf.

     

Actually, you can get an even better value to use for the buffer size by means of the fstat system call: it is found in the st_blksize field of the file attributes. See Attribute Meanings.

     

Sometimes people also use BUFSIZ as the allocation size of buffers used for related purposes, such as strings used to receive a line of input with fgets (see Character Input). There is no particular reason to use BUFSIZ for this instead of any other integer, except that it might lead to doing I / O in chunks of an efficient size.

Since this is intended for very large files, it does not make sense to store them later in a single string since it is likely that the system can not find a hole large enough ... processing this type of files will require some logic additional since a record can be found in more than one fragment ... in this life nothing is free.

Edit:

Talking about your program now. I have created for the occasion a file similar to yours and I have tried to parsed it with the following code:

// Simple 2-D value pair: a depth/sample point read from the data file.
struct Point
{
  double x;
  double y;

  Point(double xCoord, double yCoord)
    : x(xCoord)
    , y(yCoord)
  {
  }
};

// Demo: skip the header up to "[POINTS]", then read depth/sample pairs
// into a list and print how many were parsed.
int main()
{
  std::ifstream in;
  in.open("../untitled/datos.txt");

  // Consume lines up to and including the [POINTS] marker. Looping on
  // getline itself (not in.good()) stops cleanly at EOF.
  std::string line;
  while (std::getline(in, line))
  {
    if (line == "[POINTS]") break;
  }

  std::list<Point> lista;

  // Loop on the extraction, not on in.good(): the original pushed one
  // bogus point after the final failed read, because good() is still
  // true when the last successful extraction ends exactly at EOF data.
  double x, y;
  while (in >> x >> y)
  {
    lista.push_back(Point(x, y));
  }

  std::cout << lista.size();
}

The program parsed 150,502 points in less than a second. So the performance problem is due more to the way you manage the data than to the reading of the file itself.

    
answered by 18.04.2017 / 15:42
source