# chunked access from multiple threads.  on my machine, the optimal number
# of threads appears to be 2 * number of CPU:s.

import re, sys
from collections import defaultdict
import pprocess

# 1: 2.7 seconds
# 2: 2.5 seconds
# 3: 2.1 seconds
# 4: 2.0 seconds
# 5: 1.9 seconds
# 6: 1.9 seconds

def process(file, chunk):
    f = open(file)
    f.seek(chunk[0])
    d = defaultdict(int)
    for page in pat.findall(f.read(chunk[1])):
        d[page] += 1
    return d

def getchunks(file, size=1024*1024):
    # yield sequence of (start, size) chunk descriptors
    f = open(file)
    while 1:
        start = f.tell()
        f.seek(size, 1)
        s = f.readline() # skip forward to next line ending
        yield start, f.tell() - start
        if not s:
            break

# --------------------------------------------------------------------

import time, sys
if sys.platform == "win32":
    timer = time.clock
else:
    timer = time.time

t0, t1 = timer(), time.clock()

pat = re.compile(r"GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) ")

FILE = "o1000k.ap"

try:
    count = int(sys.argv[1])
except:
    count = 2

result = pprocess.Queue(limit=count, reuse=1)
process = result.manage(pprocess.MakeReusable(process))

for chunk in getchunks(FILE, 200000000 / count): # override the default, based on the file size
    process(FILE, chunk)

count = defaultdict(int)
for item in result:
    for key, value in item.items():
        count[key] += value

for key in sorted(count, key=count.get)[:10]:
    pass # print "%40s = %s" % (key, count[key])

print timer() - t0, time.clock() - t1

# --------------------------------------------------------------------

for key in sorted(count, key=count.get)[-10:]:
    print "%40s = %s" % (key, count[key])
