Init
This commit is contained in:
commit
073c5e28fa
|
@ -0,0 +1,4 @@
|
|||
*.jsonl*
|
||||
*.swp
|
||||
res.*
|
||||
*@*
|
|
@ -0,0 +1,198 @@
|
|||
import time
|
||||
import json
|
||||
import shutil
|
||||
import random
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from datetime import timedelta
|
||||
from uniplot import plot_to_string
|
||||
|
||||
|
||||
class Analizer:
    """Randomly sample lines from a JSON-lines file and collect key/value
    frequency statistics, writing running ``res.<key>.csv`` files and
    (optionally) drawing a progress graph in the terminal."""

    def __init__(self, jsonl, sample_len, keys, ky, debug=False, quiet=True):
        """Set up state and immediately run the sampling/analysis pass.

        jsonl: Path to the JSON-lines input file.
        sample_len: number of distinct lines to sample.
        keys: flattened (dot-separated) key names whose values are collected.
        ky: optional key whose distinct-value count drives the y-axis plot.
        debug: re-raise unexpected exceptions instead of skipping the line.
        quiet: suppress terminal output.
        """
        self.start_time = time.process_time()
        self.jsonl = jsonl
        self.sample_len = sample_len
        self.ky = ky
        # The y-axis key participates in extraction just like any -k key.
        self.keys = keys + [self.ky]
        self.debug = debug
        self.quiet = quiet
        self.term_size = shutil.get_terminal_size()
        # Pseudo-key under which every key name seen in the input is tallied.
        self.keys_list = "__keys.list"
        self.main_key = self.ky if self.ky else self.keys_list
        self.axes = {"x": [], "y": []}
        self.all_vals = {self.main_key: {}}
        self.curr_item = 0
        self.sample(analize=True)
        if not self.debug and not self.quiet:
            print("Extractor finished\nTotal time: " + self.__elapsed_time())

    def sample(self, analize=False):
        """Collect ``sample_len`` distinct JSON lines picked at random
        byte offsets; returns the list of decoded records.

        NOTE: this is a refactor version of https://gitlab.com/aapjeisbaas/shuf
        """
        lines = []
        bytes_per_line, total_lines_est = self.__count()
        with open(self.jsonl) as fh:
            while self.curr_item < self.sample_len:
                try:
                    linestart = random.randint(0, total_lines_est)
                    readstart = (linestart * bytes_per_line) - bytes_per_line
                    if readstart < bytes_per_line:
                        # Near the top of the file: start from the beginning.
                        fh.seek(0)
                    else:
                        # BUG FIX: seek() requires an int offset; the float
                        # previously raised TypeError (silently swallowed).
                        fh.seek(int(readstart))
                        # Discard the (likely partial) line at the seek point.
                        fh.readline()
                    line = json.loads(fh.readline())
                    # Reject empty records and duplicates already sampled.
                    if line and line not in lines:
                        lines.append(line)
                        self.curr_item += 1
                        if analize:
                            self.__analize(line)
                        else:
                            self.__prompt(line)
                except UnicodeError:
                    pass  # undecodable chunk: retry at another offset
                except Exception:
                    # Best-effort sampling: malformed/EOF reads are retried,
                    # unless we are debugging.
                    if self.debug:
                        raise
        return lines

    def __count(self):
        """Estimate average bytes per line and the total line count by
        scanning up to ~10000 lines. Returns (bytes_per_line, est_lines).

        Raises ValueError if the file contains no newline at all (an
        estimate would otherwise divide by zero).
        """
        count, bytes_read, block_size = 0, 0, 4096
        jsonl_size = self.jsonl.stat().st_size
        with open(self.jsonl, "rt") as f:
            while count < 10000:
                buffer = f.read(block_size)
                if not buffer:
                    break
                count += buffer.count("\n")
                bytes_read += block_size
        if count == 0:
            raise ValueError("no lines found in " + str(self.jsonl))
        # The last block is usually short; cap at the real file size.
        bytes_read = min(bytes_read, jsonl_size)
        bytes_per_line = bytes_read / count
        return (bytes_per_line, int(jsonl_size / bytes_per_line))

    def __analize(self, line):
        """Flatten one JSON record, tally key/value occurrences, refresh
        the CSV dumps and redraw the progress graph."""
        title = "metadata.record.title"
        df = pd.json_normalize(line, sep=".")
        flat = df.to_dict(orient="records")[0]
        msg = flat[title].strip() if title in flat else ""
        for key, val in flat.items():
            stripped = str(val).strip() if val else ""
            # Every key name seen is counted under the pseudo key-list entry.
            self.__append(self.keys_list, key)
            if key in self.keys and stripped:
                self.__append(key, stripped)
        # Refresh frequencies once per record (previously recomputed for the
        # whole table on every single append — same output, far less work).
        self.__update_freq()
        self.axes["x"].append(self.curr_item)
        self.axes["y"].append(len(self.all_vals[self.main_key]))
        self.__write()
        self.__draw(msg)

    def __append(self, key, val):
        """Count one occurrence of ``val`` under ``key``."""
        bucket = self.all_vals.setdefault(key, {})
        bucket.setdefault(val, {"len": 0, "freq": 0})
        bucket[val]["len"] += 1

    def __update_freq(self):
        """Recompute freq = occurrences / items-sampled for every value."""
        for root_key in self.all_vals:
            for key, val in self.all_vals[root_key].items():
                self.all_vals[root_key][key]["freq"] = round(
                    val["len"] / self.curr_item, 5
                )

    def __write(self):
        """Dump each key's value table to ``res.<key>.csv``."""
        for key, val in self.all_vals.items():
            file = Path("res." + key + ".csv")
            df = pd.DataFrame(val).sort_index(ascending=False).T
            if "len" in df:
                # Counts are ints; pandas would otherwise emit floats.
                df["len"] = df["len"].astype(int)
            df.to_csv(file, index_label="key")

    def __prompt(self, msg, stdout=True, extra=""):
        """Build (and optionally print, carriage-return style) the one-line
        progress prompt ``[time][pct%][items]<extra> <msg>``."""
        timer = self.__elapsed_time()
        process = int((self.curr_item * 100) / self.sample_len)
        item = "{:,}".format(self.curr_item)
        prompt = "[{}][{}%][{}i]".format(timer, process, item)
        prompt += f"{extra} {msg}"
        prompt = self.__fix_width(prompt)
        if stdout and not self.quiet:
            print(prompt, end="\r")
        return prompt

    def __draw(self, msg):
        """Redraw the distinct-values-over-items graph in the terminal."""
        if self.ky and self.ky not in self.all_vals:
            return  # y-axis key not seen yet: nothing to plot
        if self.debug or self.quiet:
            # PERF FIX: previously the graph was rendered even when it was
            # never printed; skip all plotting work in quiet/debug mode.
            return
        xs, ys = self.axes["x"], self.axes["y"]
        options = {"height": 10, "x_unit": "i", "y_unit": "k", "lines": True}
        graph = plot_to_string(ys, xs, **options)
        lines = self.term_size.lines
        extra = "[{}k]".format("{:,}".format(self.axes["y"][-1]))
        prompt = self.__prompt(msg, stdout=False, extra=extra)
        # Clear the screen by pushing old content out of view.
        print("\n".join("" for _ in range(lines)))
        graph.insert(0, prompt)
        # Pad the frame to the full terminal height.
        graph.extend("" for _ in range(lines - len(graph) - 1))
        print("\n".join(self.__fix_width(line) for line in graph))

    def __fix_width(self, line):
        """Pad or truncate ``line`` to exactly the terminal width."""
        cols = self.term_size.columns
        return line[:cols].ljust(cols)

    def __elapsed_time(self):
        """Process time since construction as ``H:MM:SS`` (no fraction)."""
        curr_time = timedelta(seconds=time.process_time() - self.start_time)
        return str(curr_time).split(".")[0]
|
||||
|
||||
|
||||
def get_args(argv=None):
    """Parse command-line arguments for the extractor.

    argv: optional explicit argument list; defaults to ``sys.argv[1:]``
    (backward compatible — the script path still calls it with no args).
    Returns the ``argparse.Namespace`` whose fields match ``Analizer``'s
    constructor parameters.
    """
    parser = argparse.ArgumentParser(
        prog="Extractor",
        description="Extracts keys and values from JSON lines.",
    )
    parser.add_argument("jsonl", type=Path, help="JSON lines file")
    parser.add_argument(
        "-i",
        "--items",
        type=int,
        default=1000,
        dest="sample_len",
        help="sample items size; 1000 by default",
    )
    parser.add_argument(
        "-k",
        "--key",
        action="append",
        # BUG FIX: without a default, omitting -k yields keys=None and
        # Analizer crashes on `None + [ky]`; an empty list is safe.
        default=[],
        dest="keys",
        metavar="KEY",
        help="key value to extract",
    )
    parser.add_argument(
        "-ky",
        "--key-axis",
        default=None,
        dest="ky",
        metavar="KEY",
        help="like -k, but its values are used for y-axis",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        default=False,
        help="stop on exception for debugging",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        default=False,
        help="avoid prints; ignored by --debug",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    Analizer(**vars(get_args()))
|
|
@ -0,0 +1,2 @@
|
|||
pandas
|
||||
uniplot
|
Reference in New Issue