In [1]:
import tubes
import json, gzip, glob
In [2]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order. Both
# benchmarks below return only the *last* row they see, so sort the paths to
# make the file order, and therefore the displayed result, reproducible.
FILES = sorted(glob.glob("../data/extracted/*.json"))
In [3]:
# Key paths extracted from every JSON record. Each entry is a tuple of nested
# dictionary keys, walked left to right; the extracted row has one value per
# entry, in this order.
KEYS = (
    # Top-level fields.
    ("timestamp",),
    ("country_code",),
    ("url",),
    # Fields under "file".
    ("file", "filename"),
    ("file", "project"),
    # Fields under "details". Note that ("details", "system") selects the
    # whole nested object, while ("details", "system", "name") selects one
    # field inside it.
    ("details", "installer", "name"),
    ("details", "python"),
    ("details", "system"),
    ("details", "system", "name"),
    ("details", "cpu"),
    ("details", "distro", "libc", "lib"),
    ("details", "distro", "libc", "version"),
)
In [4]:
def py_ver(files=None, keys=None):
    """Pure-Python baseline: extract key paths from JSON-lines files.

    Every line of every file is parsed with ``json.loads`` and turned into a
    row holding the value found at each key path. Walking a path stops as
    soon as a key is missing (or its value is JSON ``null``), and ``None`` is
    recorded for that path.

    Only the row built from the last line of the last file is kept; earlier
    rows are built and then discarded, since the function exists to measure
    parsing/extraction time rather than to collect the data.

    Args:
        files: Iterable of file paths to read. Defaults to the module-level
            ``FILES`` list.
        keys: Iterable of key paths (tuples of nested dict keys). Defaults to
            the module-level ``KEYS`` tuple.

    Returns:
        The row (a ``list`` with one entry per key path) for the last line
        processed, or ``None`` if there were no lines at all.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON (blank lines
            included).
        AttributeError: If a non-final path component resolves to a value
            that is not a dict (for example a string).
    """
    if files is None:
        files = FILES
    if keys is None:
        keys = KEYS

    # Initialise up front so that an empty file list (or only empty files)
    # returns None instead of raising UnboundLocalError at `return`.
    result = None
    for file_name in files:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_name, "rt", encoding="utf-8") as fp:
            for line in fp:
                data = json.loads(line)
                row = []
                for path in keys:
                    base = data
                    for part in path:
                        base = base.get(part)
                        if base is None:
                            # Missing key or explicit null: record None.
                            break
                    row.append(base)
                result = row
    return result
In [5]:
# Time the pure-Python baseline; py_result keeps the row that py_ver() returns.
%time py_result = py_ver()
CPU times: user 5min 49s, sys: 5.27 s, total: 5min 55s
Wall time: 5min 55s
In [6]:
def tubes_ver(files=None, keys=None):
    """`tubes` pipeline equivalent of the pure-Python extractor.

    Builds the pipeline ``Each(files) -> read_files -> split(b'\\n') -> json
    -> multi(make_getters)`` and iterates it to exhaustion, keeping only the
    last row produced.

    Args:
        files: Iterable of file paths fed to ``tubes.Each``. Defaults to the
            module-level ``FILES`` list.
        keys: Iterable of key paths (tuples of nested keys). Defaults to the
            module-level ``KEYS`` tuple.

    Returns:
        The last row yielded by the pipeline (the recorded run shows a tuple
        with one entry per key path), or ``None`` if the pipeline yields
        nothing.
    """
    if files is None:
        files = FILES
    if keys is None:
        keys = KEYS

    def make_getters(x):
        """Chain ``.get(part, 'null')`` along every key path.

        Returns a tuple with one chained getter per key path, which is what
        ``.multi()`` receives.
        """
        getters = []
        for path in keys:
            base = x
            for part in path:
                # NOTE(review): 'null' is presumably the default used when a
                # key is absent; the recorded run shows missing paths coming
                # back as None. Confirm against the tubes documentation.
                base = base.get(part, 'null')
            getters.append(base)
        return tuple(getters)

    # Initialise up front so that an empty pipeline returns None instead of
    # raising UnboundLocalError at `return`.
    result = None
    for row in (tubes.Each(files)
        .read_files()
        .split(b'\n')
        .json()
        .multi(make_getters)
    ):
        result = row
    return result
In [7]:
# Time the tubes pipeline; tubes_result keeps the row that tubes_ver() returns.
%time tubes_result = tubes_ver()
CPU times: user 1min 16s, sys: 10.7 s, total: 1min 27s
Wall time: 1min 31s
In [8]:
# Sanity check that both implementations agree on the final row. py_ver()
# returns a list, so it is converted to a tuple before comparing.
tuple(py_result) == tubes_result
Out[8]:
True
In [9]:
 # Display the final row returned by the tubes pipeline.
 tubes_result
Out[9]:
('2017-12-14 00:57:11 UTC',
 'US',
 '/packages/67/4b/141a581104b1f6397bfa78ac9d43d8ad29a7ca43ea90a2d863fe3056e86a/six-1.11.0-py2.py3-none-any.whl',
 'six-1.11.0-py2.py3-none-any.whl',
 'six',
 'pip',
 '2.7.13',
 {'name': 'Windows', 'release': '10'},
 'Windows',
 'AMD64',
 None,
 None)