This commit is contained in:
Titivillus 2023-10-21 14:58:24 +00:00
commit 073c5e28fa
3 changed files with 204 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
*.jsonl*
*.swp
res.*
*@*

198
colligere/analizer.py Normal file
View File

@ -0,0 +1,198 @@
import time
import json
import shutil
import random
import argparse
import pandas as pd
from pathlib import Path
from datetime import timedelta
from uniplot import plot_to_string
class Analizer:
    """Randomly samples lines from a JSON-lines file and tallies key/value
    frequencies.

    Results are written incrementally to ``res.<key>.csv`` files in the
    current directory; unless ``quiet``, a progress prompt and a uniplot
    graph of distinct values for the main key are drawn to the terminal.
    """

    def __init__(self, jsonl, sample_len, keys, ky, debug=False, quiet=True):
        """
        Args:
            jsonl: ``pathlib.Path`` to the JSON-lines input file.
            sample_len: number of distinct lines to sample.
            keys: list of flattened (dot-separated) keys whose values are
                tallied; may be ``None`` (argparse ``append`` default).
            ky: optional key whose distinct-value count drives the y-axis.
            debug: re-raise exceptions instead of skipping bad lines.
            quiet: suppress terminal output.
        """
        self.start_time = time.process_time()
        self.jsonl = jsonl
        self.sample_len = sample_len
        self.ky = ky
        # BUG FIX: argparse's action="append" leaves `keys` as None when no
        # -k option is given; the original `keys + [self.ky]` then raised
        # TypeError. Also avoid inserting a literal None when -ky is absent.
        self.keys = (keys or []) + ([self.ky] if self.ky else [])
        self.debug = debug
        self.quiet = quiet
        self.term_size = shutil.get_terminal_size()
        self.keys_list = "__keys.list"
        # The "main" key drives the y-axis; fall back to the key census.
        self.main_key = self.ky if self.ky else self.keys_list
        self.axes = {"x": [], "y": []}
        self.all_vals = {self.main_key: {}}
        self.curr_item = 0
        self.sample(analize=True)
        if not self.debug and not self.quiet:
            print("Extractor finished\nTotal time: " + self.__elapsed_time())

    def sample(self, analize=False):
        """Collect ``sample_len`` distinct random lines from the file.

        Seeks to a random byte offset estimated from the average line
        length, discards the partial line landed on, and parses the next
        full line as JSON.

        NOTE: this is a refactor version of https://gitlab.com/aapjeisbaas/shuf
        NOTE(review): if the file holds fewer distinct parseable lines than
        ``sample_len`` this loops forever — same as the original.

        Returns:
            The list of sampled (parsed) lines.
        """
        lines = []
        bytes_per_line, total_lines_est = self.__count()
        with open(self.jsonl) as fh:  # `with` guarantees the handle closes
            while self.curr_item < self.sample_len:
                try:
                    linestart = random.randint(0, total_lines_est)
                    readstart = (linestart * bytes_per_line) - bytes_per_line
                    if readstart < bytes_per_line:
                        readstart = 0
                    else:
                        fh.seek(readstart)
                        # Discard the partial line we landed in the middle of.
                        fh.readline()
                    line = json.loads(fh.readline())
                    if line and line not in lines:
                        lines.append(line)
                        self.curr_item += 1
                        if analize:
                            self.__analize(line)
                        else:
                            self.__prompt(line)
                except UnicodeError:
                    pass  # undecodable bytes: skip and retry another offset
                except Exception:
                    # Deliberate best-effort: malformed JSON / short reads are
                    # skipped unless we are debugging.
                    if self.debug:
                        raise
        return lines

    def __count(self):
        """Estimate the file's geometry from its first ~10,000 lines.

        Returns:
            ``(bytes_per_line, total_lines_estimate)``.

        Raises:
            ValueError: if the file contains no lines (the original code
                raised an opaque ZeroDivisionError here).
        """
        count, bytes_read, block_size = 0, 0, 4096
        jsonl_size = self.jsonl.stat().st_size
        with open(self.jsonl, "rt") as f:
            while count < 10000:
                buffer = f.read(block_size)
                if not buffer:
                    break
                count += buffer.count("\n")
                bytes_read += block_size
        # The last block may have read past EOF; clamp to the real size.
        if bytes_read > jsonl_size:
            bytes_read = jsonl_size
        if count == 0:
            raise ValueError("no lines found in %s" % self.jsonl)
        bytes_per_line = bytes_read / count
        return (bytes_per_line, int(jsonl_size / bytes_per_line))

    def __analize(self, line):
        """Flatten one JSON record, tally its keys/values, write and draw."""
        title = "metadata.record.title"
        df = pd.json_normalize(line, sep=".")
        flat = df.to_dict(orient="records")[0]
        # Title is only used as the progress-prompt message, if present.
        msg = flat[title].strip() if title in flat else ""
        for key, val in flat.items():
            stripped = str(val).strip() if val else ""
            self.__append(self.keys_list, key)  # census of every key seen
            if key in self.keys and stripped:
                self.__append(key, stripped)
        # PERF: recompute frequencies once per line instead of once per key
        # (the CSVs are only written below, so the output is identical).
        self.__update_freq()
        self.axes["x"].append(self.curr_item)
        self.axes["y"].append(len(self.all_vals[self.main_key]))
        self.__write()
        self.__draw(msg)

    def __append(self, key, val):
        """Increment the occurrence count of ``val`` under ``key``."""
        self.all_vals.setdefault(key, {})
        self.all_vals[key].setdefault(val, {"len": 0, "freq": 0})
        self.all_vals[key][val]["len"] += 1

    def __update_freq(self):
        """Recompute each value's relative frequency over the current sample."""
        for root_key in self.all_vals:
            for stats in self.all_vals[root_key].values():
                stats["freq"] = round(stats["len"] / self.curr_item, 5)

    def __write(self):
        """Dump every tally table to ``res.<key>.csv`` in the working dir."""
        for key, val in self.all_vals.items():
            file = Path("res." + key + ".csv")
            df = pd.DataFrame(val).sort_index(ascending=False).T
            if "len" in df:
                df["len"] = df["len"].astype(int)
            df.to_csv(file, index_label="key")

    def __prompt(self, msg, stdout=True, extra=""):
        """Build (and, unless quiet, print in place) the one-line progress bar.

        Returns:
            The prompt string padded/truncated to the terminal width.
        """
        timer = self.__elapsed_time()
        process = int((self.curr_item * 100) / self.sample_len)
        item = "{:,}".format(self.curr_item)
        prompt = "[{}][{}%][{}i]".format(timer, process, item)
        prompt += f"{extra} {msg}"
        prompt = self.__fix_width(prompt)
        if stdout and not self.quiet:
            print(prompt, end="\r")  # \r keeps the bar on a single line
        return prompt

    def __draw(self, msg):
        """Redraw the distinct-values-over-items graph plus the prompt."""
        if self.ky and self.ky not in self.all_vals:
            return  # y-axis key not seen yet: nothing meaningful to plot
        xs, ys = self.axes["x"], self.axes["y"]
        options = {"height": 10, "x_unit": "i", "y_unit": "k", "lines": True}
        graph = plot_to_string(ys, xs, **options)
        if not self.debug and not self.quiet:
            lines = self.term_size.lines
            extra = "[{}k]".format("{:,}".format(self.axes["y"][-1]))
            prompt = self.__prompt(msg, stdout=False, extra=extra)
            # Push the previous frame off-screen, then repaint a full frame.
            print("\n" * (lines - 1))
            graph.insert(0, prompt)
            for _ in range(lines - len(graph) - 1):
                graph.append("")
            print("\n".join(self.__fix_width(line) for line in graph))

    def __fix_width(self, line):
        """Pad or truncate ``line`` to exactly the terminal width."""
        cols = self.term_size.columns
        if len(line) < cols:
            return line + " " * (cols - len(line))
        return line[:cols]

    def __elapsed_time(self):
        """Wall-format CPU time since construction, without microseconds."""
        curr_time = timedelta(seconds=time.process_time() - self.start_time)
        return str(curr_time).split(".")[0]
def get_args(argv=None):
    """Build the CLI parser and parse arguments.

    Hoisted out of the ``__main__`` guard so it can be imported and tested.

    Args:
        argv: optional list of argument strings; ``None`` means
            ``sys.argv[1:]`` (argparse default).

    Returns:
        The parsed ``argparse.Namespace`` whose attributes match the
        ``Analizer.__init__`` keyword parameters.
    """
    parser = argparse.ArgumentParser(
        prog="Extractor",
        description="Extracts keys and values from JSON lines.",
    )
    parser.add_argument("jsonl", type=Path, help="JSON lines file")
    parser.add_argument(
        "-i",
        "--items",
        type=int,
        default=1000,
        dest="sample_len",
        help="sample items size; 1000 by default",
    )
    parser.add_argument(
        "-k",
        "--key",
        action="append",
        # BUG FIX: action="append" without a default yields None when -k is
        # never passed, which crashed Analizer's `keys + [ky]` concatenation.
        default=[],
        dest="keys",
        metavar="KEY",
        help="key value to extract",
    )
    parser.add_argument(
        "-ky",
        "--key-axis",
        default=None,
        dest="ky",
        metavar="KEY",
        help="like -k, but its values are used for y-axis",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        default=False,
        help="stop on exception for debugging",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        default=False,
        help="avoid prints; ignored by --debug",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    Analizer(**vars(get_args()))

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
pandas
uniplot