Efficient CSV Processing in C++ Using Memory Mapping
Efficient CSV Processing in C++ Using Memory Mapping
In this sample project, I demonstrate how to use mmap
in C++ to read a CSV file. Given a CSV file and a column index, the program extracts the values of that column and writes them to a separate file named after the input file, with the column index and the suffix “_col” appended. The implementation is outlined below:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <err.h> /* GNU C lib error messages: err */
#include <errno.h> /* Std C library system error numbers: errno */
#include <fcntl.h> /* C POSIX library file control options */
#include <stdio.h>
#include <sys/mman.h> /* memory management declarations: mmap, munmap */
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h> /* C POSIX library system calls: open, close */
#include <fstream>
#include <set>
#include <string>
/*
 * Memory-map the file `fname`, split it into lines and hand every line to
 * `call_back`, which collects values into a std::set. The collected values
 * are written, one per line, to the file "<fname><col>_col".
 *
 * Parameters:
 *   fname     - path of the input file.
 *   col       - column index; used to build the output file name and passed
 *               through to `call_back` unchanged.
 *   call_back - called as call_back(begin, end, col_data, col) for each line.
 *               `begin` points at the first byte of the line and `end` at its
 *               LAST byte (inclusive): the final byte of the line terminator,
 *               or the last byte of the file when the final line has no
 *               terminator. Both pointers always lie inside the mapping.
 *               A return value of 0 signals failure.
 *
 * Returns the output file name, or an empty string when the input file has
 * no content (nothing is mapped and no output file is created).
 *
 * Failures of open/fstat/mmap, of opening the output file, and a failing
 * callback are reported through err()/errx(), which print a message and
 * terminate the process with exit status 1 - they do not return.
 *
 * Line terminators: "\n", "\r", "\r\n" and "\n\r" each end one line.
 */
std::string extract_column(const char *fname, int col,
                           int (*call_back)(const char *, const char *, std::set<std::string> &, int)) {
    /* Number of collected values after which the set is written out. */
    const std::set<std::string>::size_type kFlushThreshold = 2;

    const int fd = open(fname, O_RDONLY);
    if (fd == -1) {
        err(1, "open mmap: %s", fname);
    }

    struct stat fs;
    if (fstat(fd, &fs) == -1) {
        err(1, "stat:%s", fname);
    }

    /* mmap() rejects a zero length with EINVAL, and the scanner below
       dereferences the first byte, so an empty file is handled up front. */
    if (fs.st_size <= 0) {
        close(fd);
        return {};
    }
    const size_t length = static_cast<size_t>(fs.st_size);

    void *const mapping = mmap(nullptr, length, PROT_READ, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED) {
        err(1, "mmap: %s", fname);
    }

    std::string col_fname = std::string(fname) + std::to_string(col) + "_col";
    std::ofstream output_file(col_fname);
    if (!output_file) {
        err(1, "open output: %s", col_fname.c_str());
    }

    /* NOTE(review): col_data is a std::set, so values are sorted and
       de-duplicated only within one flushed batch, not across the file. */
    std::set<std::string> col_data;
    const auto flush_values = [&col_data, &output_file]() {
        for (const std::string &value : col_data) {
            output_file << value << "\n";
        }
        col_data.clear();
    };

    const char *const buf = static_cast<const char *>(mapping);
    const char *const buf_end = buf + length;
    const char *begin = buf;
    const char *end = buf;

    while (true) {
        if (*end != '\r' && *end != '\n') {
            if (++end < buf_end) {
                continue;
            }
            /* Reached EOF inside an unterminated last line. The callback
               treats `end` as inclusive, so step back onto the last byte of
               the file instead of passing a pointer past the mapping. */
            end = buf_end - 1;
        } else if (end + 1 < buf_end) {
            /* see if we got "\r\n" or "\n\r" here */
            const char next = end[1];
            if ((next == '\r' || next == '\n') && next != *end) {
                ++end;
            }
        }

        /* Announce the error here, because the callback does not know the
           file name. errx() rather than err(): errno carries no information
           about a callback failure. */
        if (!call_back(begin, end, col_data, col)) {
            errx(1, "[callback] %s", fname);
        }

        begin = ++end;
        if (begin >= buf_end) {
            break;
        }

        /* ">=" so that a callback adding several values per line cannot
           jump over the threshold and postpone the flush indefinitely. */
        if (col_data.size() >= kFlushThreshold) {
            flush_values();
        }
    }

    flush_values();
    munmap(mapping, length);
    close(fd);
    return col_fname;
}
/*
 * Line callback for extract_column(): insert the value of the comma-separated
 * field number `col` (0-based) of one line into `col_data`.
 *
 * Parameters:
 *   begin    - first byte of the line.
 *   end      - LAST byte of the line (inclusive); it must be readable.
 *              Trailing '\r' / '\n' bytes are not part of any field.
 *   col_data - receives the field value; untouched when the line is blank or
 *              has fewer than col + 1 fields.
 *   col      - 0-based field index.
 *
 * Returns 1 in every case (non-zero is what extract_column treats as
 * success); a missing field is not reported as an error.
 *
 * Fields are split on every ',' - quoted fields containing commas are not
 * recognised. The last field of a line needs no trailing comma.
 */
int column_val_callback(const char *begin, const char *end, std::set<std::string> &col_data, int col) {
    if (begin == nullptr || end == nullptr || end < begin || col < 0) {
        return 1;
    }

    /* Convert the inclusive end into a one-past-the-end pointer and drop the
       line terminator so it cannot leak into the last field. */
    const char *line_end = end + 1;
    while (line_end > begin && (line_end[-1] == '\r' || line_end[-1] == '\n')) {
        --line_end;
    }
    if (line_end == begin) {
        return 1; /* blank line: nothing to extract */
    }

    int current_col = 0;
    const char *field_start = begin;
    for (const char *pos = begin;; ++pos) {
        const bool at_line_end = (pos == line_end);
        if (!at_line_end && *pos != ',') {
            continue;
        }
        /* `pos` sits on a field boundary: a comma or the end of the line. */
        if (current_col == col) {
            col_data.insert(std::string(field_start, pos));
            break; /* the requested field was found; skip the rest */
        }
        if (at_line_end) {
            break; /* line has fewer fields than requested */
        }
        field_start = pos + 1;
        ++current_col;
    }
    return 1;
}
/*
 * Entry point.
 *
 * Usage: <program> [csv-file [column-index]]
 *
 * Without arguments the program processes column 1 of "test.csv". The column
 * index must be a non-negative integer. Exits with status 1 when the column
 * index is invalid or when extract_column() returns an empty file name.
 */
int main(int argc, char **argv) {
    const std::string fname = (argc > 1) ? argv[1] : "test.csv";

    int col = 1;
    if (argc > 2 && (sscanf(argv[2], "%d", &col) != 1 || col < 0)) {
        errx(1, "invalid column index: %s", argv[2]);
    }

    const std::string col_fname = extract_column(fname.c_str(), col, &column_val_callback);
    if (col_fname.empty()) {
        /* errx() rather than err(): this is not a system-call failure, so
           appending strerror(errno) would only mislead. */
        errx(1, "Column %d for %s appears to be empty.", col, fname.c_str());
    }
    return 0;
}
This post is licensed under CC BY 4.0 by the author.