Post

Efficient CSV Processing in C++ Using Memory Mapping

Efficient CSV Processing in C++ Using Memory Mapping

In this sample project, I demonstrate how to use mmap in C++ to read a CSV file. Given a CSV file and a column index, the program extracts the values of that column and writes them to a separate output file whose name is the input file name followed by the column index and the suffix “_col”. The implementation is shown below:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <err.h>   /* GNU C lib error messages: err */
#include <errno.h> /* Std C libary system error numbers: errno */
#include <fcntl.h> /* C POSIX libary file control options */
#include <stdio.h>
#include <sys/mman.h> /* memory management declarations: mmap, munmap */
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h> /* C POSIX libary system calls: open, close */

#include <algorithm> /* std::copy */
#include <climits>   /* INT_MAX */
#include <cstdlib>   /* std::strtol */
#include <fstream>
#include <iterator> /* std::ostream_iterator */
#include <set>
#include <string>

/*
 * Memory-map the file `fname`, hand it to `call_back` one line at a time and
 * write whatever the callback collects to the file "<fname><col>_col".
 *
 * Parameters:
 *   fname     - path of the input file; it is opened read-only.
 *   col       - column index; forwarded unchanged to `call_back` and used in
 *               the output file name.
 *   call_back - invoked once per line as call_back(begin, end, col_data, col).
 *               `begin` points at the first byte of the line and `end` at its
 *               LAST byte (inclusive), which is the final byte of the line
 *               terminator, or the last byte of the file when the final line
 *               is not terminated. A return value of 0 signals an error.
 *
 * Lines are terminated by "\n", "\r", "\r\n" or "\n\r"; a two-byte terminator
 * made of two different characters is treated as a single line break.
 *
 * Values placed in `col_data` are buffered in a std::set and written out (one
 * per line, in set order) every time the buffer holds two or more entries, and
 * once more at the end. Ordering and de-duplication therefore only apply
 * within each flushed batch, not across the whole file.
 *
 * Returns the name of the output file. An empty input file yields an empty
 * output file. Failures (open/fstat/mmap, output stream errors, a callback
 * returning 0, a null callback) terminate the process with exit status 1 via
 * err()/errx(); the function does not return in those cases.
 */
std::string extract_column(const char *fname, int col,
                           int (*call_back)(const char *, const char *, std::set<std::string> &, int)) {
    /* Number of buffered values that triggers a write to the output file. */
    const std::set<std::string>::size_type flush_threshold = 2;

    if (call_back == nullptr) {
        errx(1, "[callback] %s: no callback supplied", fname);
    }

    const int fd = open(fname, O_RDONLY);
    if (fd == -1) {
        err(1, "open mmap: %s", fname);
    }

    struct stat fs;
    if (fstat(fd, &fs) == -1) {
        err(1, "stat:%s", fname);
    }
    const size_t size = static_cast<size_t>(fs.st_size);

    /* mmap() rejects a zero length with EINVAL, so an empty file is simply
       treated as "no lines" and never mapped. */
    void *mapping = nullptr;
    if (size > 0) {
        mapping = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
        if (mapping == MAP_FAILED) {
            err(1, "mmap: %s", fname);
        }
    }
    /* The mapping stays valid after the descriptor is closed. */
    close(fd);

    std::string col_fname = std::string(fname) + std::to_string(col) + "_col";
    std::ofstream output_file(col_fname);
    if (!output_file) {
        errx(1, "cannot open output file: %s", col_fname.c_str());
    }
    std::ostream_iterator<std::string> output_iterator(output_file, "\n");

    std::set<std::string> col_data;
    const char *const buf = static_cast<const char *>(mapping);
    const char *const buf_end = buf + size;
    const char *line_begin = buf;

    while (line_begin < buf_end) {
        /* Advance to the first terminator byte, or to the end of the data. */
        const char *line_end = line_begin;
        while (line_end < buf_end && *line_end != '\r' && *line_end != '\n') {
            ++line_end;
        }

        if (line_end == buf_end) {
            /* Final line without a terminator: the inclusive end is the last
               byte of the file, never one past the mapping. */
            --line_end;
        } else if (line_end + 1 < buf_end) {
            /* see if we got "\r\n" or "\n\r" here */
            const char next = line_end[1];
            if ((next == '\r' || next == '\n') && next != *line_end) {
                ++line_end;
            }
        }

        /* Announce callback errors here, because the callback is not told the
           file name. errno is not meaningful for this failure, hence errx. */
        if (!call_back(line_begin, line_end, col_data, col)) {
            errx(1, "[callback] %s", fname);
        }
        line_begin = line_end + 1;

        if (col_data.size() >= flush_threshold) {
            /* Flush the buffered batch to the output file. */
            std::copy(col_data.begin(), col_data.end(), output_iterator);
            col_data.clear();
        }
    }
    /* Write whatever is left in the buffer. */
    std::copy(col_data.begin(), col_data.end(), output_iterator);

    output_file.flush();
    if (!output_file) {
        errx(1, "write failed: %s", col_fname.c_str());
    }

    if (mapping != nullptr) {
        munmap(mapping, size);
    }
    return col_fname;
}

/*
 * Extract the value of column `col` (0-based) from one comma-separated line
 * and add it to `col_data`.
 *
 * Parameters:
 *   begin    - first byte of the line.
 *   end      - LAST byte of the line (inclusive); both bytes must be readable.
 *   col_data - set that receives the extracted value.
 *   col      - index of the wanted column, counting from 0.
 *
 * Trailing '\r' / '\n' bytes are removed before splitting, so the last column
 * never carries the line terminator. Lines that are empty after that, and
 * lines with fewer than col + 1 fields, add nothing. Fields are split on every
 * ','; quoted fields containing commas are not recognised.
 *
 * Returns 1 on success (including "nothing to extract") and 0 when the
 * [begin, end] range is null or reversed.
 */
int column_val_callback(const char *begin, const char *end, std::set<std::string> &col_data, int col) {
    if (begin == nullptr || end == nullptr || end < begin) {
        return 0;
    }

    std::string line(begin, static_cast<std::string::size_type>(end - begin + 1));
    while (!line.empty() && (line.back() == '\r' || line.back() == '\n')) {
        line.pop_back();
    }
    if (line.empty() || col < 0) {
        return 1;
    }

    int current_col = 0;
    std::string::size_type field_start = 0;
    while (true) {
        const std::string::size_type comma = line.find(',', field_start);
        if (current_col == col) {
            /* The wanted field ends at the next comma or, for the last
               column, at the end of the line. */
            const std::string::size_type length =
                (comma == std::string::npos) ? std::string::npos : comma - field_start;
            col_data.emplace(line.substr(field_start, length));
            break;
        }
        if (comma == std::string::npos) {
            break; /* the line has fewer columns than requested */
        }
        field_start = comma + 1;
        ++current_col;
    }
    return 1;
}

/*
 * Program entry point.
 *
 * Usage: <program> [csv-file [column-index]]
 *
 * Without arguments the program processes "test.csv" and column 1. The first
 * argument overrides the file name, the second the column index (a
 * non-negative decimal integer). The column values are extracted with
 * extract_column() using column_val_callback().
 *
 * Exits with status 1 when the column index is malformed or when
 * extract_column() returns an empty file name.
 */
int main(int argc, char **argv) {
    const std::string fname = (argc > 1) ? argv[1] : "test.csv";

    int col = 1;
    if (argc > 2) {
        char *parse_end = nullptr;
        errno = 0;
        const long parsed = std::strtol(argv[2], &parse_end, 10);
        /* Reject empty input, trailing garbage, overflow and negatives. */
        if (errno != 0 || parse_end == argv[2] || *parse_end != '\0' || parsed < 0 || parsed > INT_MAX) {
            errx(1, "invalid column index: %s", argv[2]);
        }
        col = static_cast<int>(parsed);
    }

    const std::string col_fname = extract_column(fname.c_str(), col, &column_val_callback);
    if (col_fname.empty()) {
        /* errx, not err: errno carries no information about this condition. */
        errx(1, "Column %d for %s appears to be empty.", col, fname.c_str());
    }
    return 0;
}
This post is licensed under CC BY 4.0 by the author.