In [1]:
import tubes
import json, gzip, glob
In [2]:
# Input files for both benchmark functions. glob.glob() gives no ordering
# guarantee, and both functions return the row from the *last* matching
# record, so sort the paths to make the displayed result reproducible.
FILES = sorted(glob.glob("../data/extracted/*.json"))
In [3]:
# Fields to extract from every record. Each entry is a path: a tuple of
# dict keys that py_ver() and tubes_ver() follow in order, one nested
# level per key. The order of the entries is the order of the values in
# each result row.
KEYS = (
    # Top-level fields.
    ("timestamp", ),
    ("country_code", ),
    ("url", ),
    # Fields nested under "file".
    ("file", "filename"), 
    ("file", "project"), 
    # Fields nested under "details" (up to three levels deep).
    ("details", "installer", "name"),
    ("details", "python"),
    ("details", "system", "name"),
    ("details", "cpu"),
    ("details", "distro", "libc", "lib"),
    ("details", "distro", "libc", "version"),
)
In [4]:
def py_ver():
    """Pure-Python baseline: extract the KEYS fields from every "GB" record.

    Reads each path in FILES as newline-delimited JSON, keeps the records
    whose "country_code" is "GB", and builds one row per kept record by
    following each key path in KEYS through the nested dicts. A path that
    hits a missing key (or a null value) contributes None to the row.

    Returns:
        list: the row built from the last matching record.

    Raises:
        IndexError: if no record in any file has country_code "GB".
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    result = []
    for file_name in FILES:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(file_name, "rt", encoding="utf-8") as fp:
            for line in fp:
                # Tolerate blank lines (e.g. a doubled trailing newline)
                # instead of crashing inside json.loads().
                if not line.strip():
                    continue
                data = json.loads(line)
                if data.get("country_code") != "GB":
                    continue
                row = []
                for path in KEYS:
                    base = data
                    for part in path:
                        base = base.get(part, None)
                        if base is None:
                            # Missing key or null value: stop descending.
                            break
                    row.append(base)
                result.append(row)
    # All rows are accumulated even though only the last one is returned;
    # this mirrors tubes_ver(), which also materialises a full list first.
    return result[-1]
In [5]:
# Time the pure-Python baseline; py_result holds the row it returns.
%time py_result = py_ver()
CPU times: user 5min 13s, sys: 3min 29s, total: 8min 43s
Wall time: 8min 43s
In [6]:
def tubes_ver():
    """tubes-based counterpart of py_ver().

    Feeds FILES through a tubes pipeline: read each file, split on
    newlines, parse each chunk as JSON, keep only records whose
    "country_code" equals "GB", then emit one value per KEYS path.

    Returns:
        The last item the pipeline produces (a tuple of field values,
        per the recorded notebook output).

    Raises:
        IndexError: if the pipeline produces no items.
    """
    def field_lookups(record):
        # Build one chained .get() lookup per KEYS path. 'null' is passed
        # as the fallback at every level of the chain.
        lookups = []
        for key_path in KEYS:
            node = record
            for key in key_path:
                node = node.get(key, 'null')
            lookups.append(node)
        return tuple(lookups)

    def is_gb(record):
        # Filter predicate: country_code (fallback "null") must equal "GB".
        return record.get("country_code", "null").to(bytes).equals("GB")

    pipeline = (
        tubes.Each(FILES)
        .read_files()
        .split(b'\n')
        .json()
        .skip_unless(is_gb)
        .multi(field_lookups)
    )
    # Materialise every row, then keep only the final one.
    return list(pipeline)[-1]
In [7]:
# Time the tubes pipeline on the same FILES; tubes_result holds the row it returns.
%time tubes_result = tubes_ver()
CPU times: user 4.89 s, sys: 2.54 s, total: 7.43 s
Wall time: 7.61 s
In [8]:
# Sanity check: both implementations must agree. py_ver() returns a list
# and tubes_ver() a tuple, so convert before comparing.
tuple(py_result) == tubes_result
Out[8]:
True
In [9]:
 tubes_result
Out[9]:
('2017-12-14 00:50:15 UTC',
 'GB',
 '/packages/f0/a3/bbb4a86cca67fd9db8527caae4221dc14e1900f81cc9caff1ac4724ea838/scp-0.10.2-py2.py3-none-any.whl',
 'scp-0.10.2-py2.py3-none-any.whl',
 'scp',
 'pip',
 '2.7.6',
 'Linux',
 'x86_64',
 'glibc',
 '2.19')
In [10]:
py_result
Out[10]:
['2017-12-14 00:50:15 UTC',
 'GB',
 '/packages/f0/a3/bbb4a86cca67fd9db8527caae4221dc14e1900f81cc9caff1ac4724ea838/scp-0.10.2-py2.py3-none-any.whl',
 'scp-0.10.2-py2.py3-none-any.whl',
 'scp',
 'pip',
 '2.7.6',
 'Linux',
 'x86_64',
 'glibc',
 '2.19']
In [14]:
# py_ver() run time from the %time output above (8min 43s), in seconds.
(8*60) + 43
Out[14]:
523
In [15]:
# Speed-up factor: py_ver() seconds divided by tubes_ver() total CPU
# seconds (7.43 s; its wall time was 7.61 s).
523/7.43
Out[15]:
70.39030955585464