commit 073c5e28fa7655fe504dfdefaa6bd6fa81149c13
Author: Titivillus
Date:   Sat Oct 21 14:58:24 2023 +0000

    Init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d1e75e7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.jsonl*
+*.swp
+res.*
+*@*
diff --git a/colligere/analizer.py b/colligere/analizer.py
new file mode 100644
index 0000000..77f3ece
--- /dev/null
+++ b/colligere/analizer.py
@@ -0,0 +1,198 @@
+import time
+import json
+import shutil
+import random
+import argparse
+import pandas as pd
+from pathlib import Path
+from datetime import timedelta
+from uniplot import plot_to_string
+
+
+class Analizer:
+    def __init__(self, jsonl, sample_len, keys, ky, debug=False, quiet=True):
+        self.start_time = time.process_time()
+        self.jsonl = jsonl
+        self.sample_len = sample_len
+        self.ky = ky
+        # -k may be omitted (argparse passes keys=None) and -ky may be None
+        self.keys = (keys or []) + ([self.ky] if self.ky else [])
+        self.debug = debug
+        self.quiet = quiet
+        self.term_size = shutil.get_terminal_size()
+        self.keys_list = "__keys.list"
+        self.main_key = self.ky if self.ky else self.keys_list
+        self.axes = {"x": [], "y": []}
+        self.all_vals = {self.main_key: {}}
+        self.curr_item = 0
+        self.sample(analize=True)
+        if not self.debug and not self.quiet:
+            print("Extractor finished\nTotal time: " + self.__elapsed_time())
+
+    def sample(self, analize=False):
+        """
+        Samples random lines by seeking to estimated byte offsets.
+        NOTE: this is a refactored version of https://gitlab.com/aapjeisbaas/shuf
+        """
+        line, lines, count = None, [], self.__count()
+        bytes_per_line, total_lines_est = count[0], count[1]
+        fh = open(self.jsonl)
+        while self.curr_item < self.sample_len:
+            try:
+                linestart = random.randint(0, total_lines_est)
+                readstart = int(linestart * bytes_per_line - bytes_per_line)
+                if readstart < bytes_per_line:
+                    fh.seek(0)  # too close to the start: take the first line
+                else:
+                    fh.seek(readstart)
+                    fh.readline()  # discard the partial line landed on
+                line = json.loads(fh.readline())
+                if line and line not in lines:
+                    lines.append(line)
+                    self.curr_item += 1
+                    self.__analize(line) if analize else self.__prompt(line)
+            except UnicodeError:
+                pass  # seek landed inside a multi-byte character
+            except Exception:
+                if self.debug:
+                    raise
+        fh.close()
+        return lines
+
+    def __count(self):
+        """Estimates bytes per line and total lines from the first blocks."""
+        count, bytes_read, block_size = 0, 0, 4096
+        jsonl_size = self.jsonl.stat().st_size
+        f = open(self.jsonl, "rb")  # binary, so bytes_read counts bytes
+        while count < 10000:
+            buffer = f.read(block_size)
+            if not buffer:
+                break
+            count += buffer.count(b"\n")
+            bytes_read += len(buffer)  # the last block may be short
+        f.close()
+        count = max(count, 1)  # guard against files without newlines
+        bytes_per_line = bytes_read / count
+        return (bytes_per_line, int(jsonl_size / bytes_per_line))
+
+    def __analize(self, line):
+        title = "metadata.record.title"
+        df = pd.json_normalize(line, sep=".")
+        flat = df.to_dict(orient="records")[0]
+        msg = flat[title].strip() if isinstance(flat.get(title), str) else ""
+        for key, val in flat.items():
+            stripped = str(val).strip() if val else ""
+            self.__append(self.keys_list, key)
+            if key in self.keys and stripped:
+                self.__append(key, stripped)
+        self.axes["x"].append(self.curr_item)
+        self.axes["y"].append(len(self.all_vals[self.main_key]))
+        self.__write()
+        self.__draw(msg)
+
+    def __append(self, key, val):
+        self.all_vals.setdefault(key, {})
+        self.all_vals[key].setdefault(val, {"len": 0, "freq": 0})
+        self.all_vals[key][val]["len"] += 1
+        self.__update_freq()
+
+    def __update_freq(self):
+        for root_key in self.all_vals.keys():
+            for key, val in self.all_vals[root_key].items():
+                freq = round(val["len"] / self.curr_item, 5)
+                self.all_vals[root_key][key]["freq"] = freq
+
+    def __write(self):
+        for key, val in self.all_vals.items():
+            file = Path("res." + key + ".csv")
+            df = pd.DataFrame(val).sort_index(ascending=False).T
+            if "len" in df:
+                df["len"] = df["len"].astype(int)
+            df.to_csv(file, index_label="key")
+
+    def __prompt(self, msg, stdout=True, extra=""):
+        timer = self.__elapsed_time()
+        percent = int((self.curr_item * 100) / self.sample_len)
+        item = "{:,}".format(self.curr_item)
+        prompt = "[{}][{}%][{}i]".format(timer, percent, item)
+        prompt += f"{extra} {msg}"
+        prompt = self.__fix_width(prompt)
+        if stdout and not self.quiet:
+            print(prompt, end="\r")
+        return prompt
+
+    def __draw(self, msg):
+        if self.ky and self.ky not in self.all_vals.keys():
+            return
+        xs, ys = self.axes["x"], self.axes["y"]
+        options = {"height": 10, "x_unit": "i", "y_unit": "k", "lines": True}
+        graph = plot_to_string(ys, xs, **options)
+        if not self.debug and not self.quiet:
+            lines = self.term_size.lines
+            extra = "[{}k]".format("{:,}".format(self.axes["y"][-1]))
+            prompt = self.__prompt(msg, stdout=False, extra=extra)
+            print("\n".join(["" for _ in range(lines)]))  # clear the screen
+            graph.insert(0, prompt)
+            graph.extend([""] * (lines - len(graph) - 1))
+            print("\n".join(map(self.__fix_width, graph)))
+
+    def __fix_width(self, line):
+        cols = self.term_size.columns
+        if len(line) < cols:
+            line += " " * (cols - len(line))
+        elif len(line) > cols:
+            line = line[0:cols]
+        return line
+
+    def __elapsed_time(self):
+        curr_time = timedelta(seconds=time.process_time() - self.start_time)
+        return str(curr_time).split(".")[0]
+
+
+if __name__ == "__main__":
+
+    def get_args():
+        parser = argparse.ArgumentParser(
+            prog="Extractor",
+            description="Extracts keys and values from JSON lines.",
+        )
+        parser.add_argument("jsonl", type=Path, help="JSON lines file")
+        parser.add_argument(
+            "-i",
+            "--items",
+            type=int,
+            default=1000,
+            dest="sample_len",
+            help="sample items size; 1000 by default",
+        )
+        parser.add_argument(
+            "-k",
+            "--key",
+            action="append",
+            dest="keys",
+            metavar="KEY",
+            help="key value to extract",
+        )
+        parser.add_argument(
+            "-ky",
+            "--key-axis",
+            default=None,
+            dest="ky",
+            metavar="KEY",
+            help="like -k, but its values are used for the y-axis",
+        )
+        parser.add_argument(
+            "--debug",
+            action="store_true",
+            default=False,
+            help="stop on exception for debugging",
+        )
+        parser.add_argument(
+            "--quiet",
+            action="store_true",
+            default=False,
+            help="suppress progress output; implied by --debug",
+        )
+        return parser.parse_args()
+
+    Analizer(**vars(get_args()))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fd09353
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+uniplot