commit 285e112232dabf7491857721062ab5947eb3f22a Author: Titivillus Date: Fri Apr 12 21:37:29 2024 +0000 Init diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c83bca8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +registered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text +unregistered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ecea04 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +annas_archive.jsonl +annas_archive_es.jsonl diff --git a/keys_list.csv b/keys_list.csv new file mode 100644 index 0000000..0ceb956 --- /dev/null +++ b/keys_list.csv @@ -0,0 +1,68 @@ +key,len,freq +aacid,243,1.0 +metadata.oclc_number,243,1.0 +metadata.type,243,1.0 +metadata.from_filenames,243,1.0 +metadata.record.oclcNumber,243,1.0 +metadata.record.title,243,1.0 +metadata.record.creator,243,1.0 +metadata.record.generalFormat,243,1.0 +metadata.record.specificFormat,243,1.0 +metadata.record.edition,243,1.0 +metadata.record.totalEditions,243,1.0 +metadata.record.publisher,243,1.0 +metadata.record.publicationPlace,226,0.93004 +metadata.record.publicationDate,243,1.0 +metadata.record.catalogingLanguage,243,1.0 +metadata.record.summary,76,0.31276 +metadata.record.physicalDescription,203,0.83539 +metadata.record.series,203,0.83539 +metadata.record.castNotes,203,0.83539 +metadata.record.languageNotes,203,0.83539 +metadata.record.subjectsText,203,0.83539 +metadata.record.cartographicData,203,0.83539 +metadata.record.dissertationInfo,196,0.80658 +metadata.record.performerNotes,203,0.83539 +metadata.record.genre,203,0.83539 +metadata.record.numericDesignation,203,0.83539 +metadata.record.audience,203,0.83539 +metadata.record.generalNotes,203,0.83539 +metadata.record.creditNotes,203,0.83539 +metadata.record.contentNotes,194,0.79835 +metadata.record.reproductionNotes,203,0.83539 +metadata.record.eventNotes,203,0.83539 +metadata.record.doi,203,0.83539 
+metadata.record.peerReviewed,243,1.0 +metadata.record.mediumOfPerformance,203,0.83539 +metadata.record.issns,203,0.83539 +metadata.record.additionalPhysicalFormEntries,203,0.83539 +metadata.record.digitalAccessAndLocations,203,0.83539 +metadata.record.digitalObjectInfo,242,0.99588 +metadata.record.abstract,243,1.0 +metadata.record.evaluativeContent,203,0.83539 +metadata.record.otherFormats,243,1.0 +metadata.record.isbns,243,1.0 +metadata.record.isbn13,243,1.0 +metadata.record.openAccessLinks,217,0.893 +metadata.record.publication,243,1.0 +metadata.record.sourceIssn,203,0.83539 +metadata.record.sourceIsbns,203,0.83539 +metadata.record.contributors,243,1.0 +metadata.record.titleInfo.text,189,0.77778 +metadata.record.publisherName.text,169,0.69547 +metadata.record.machineReadableDate,42,0.17284 +metadata.record.seriesVolumes,42,0.17284 +metadata.record.subjects,40,0.16461 +metadata.record.summaries,40,0.16461 +metadata.record.openAccessLink,40,0.16461 +metadata.record.contentNotes.text,9,0.03704 +metadata.record.publisherName,20,0.0823 +metadata.record.dissertationInfo.institution,1,0.00412 +metadata.record.dissertationInfo.year,1,0.00412 +metadata.record.digitalObjectInfo.thumbnailImage,1,0.00412 +metadata.record.digitalObjectInfo.iiifImageInfoUrl,1,0.00412 +metadata.record.digitalObjectInfo.iiifItemManifestUrl,1,0.00412 +metadata.record.digitalObjectInfo.contentDMCollectionId,1,0.00412 +metadata.record.digitalObjectInfo.iiifCollectionManifestUrl,1,0.00412 +metadata.record.digitalObjectInfo.oaiSetName,1,0.00412 +metadata.record.filmInfo,1,0.00412 diff --git a/registered_books_annas_archive_es.jsonl b/registered_books_annas_archive_es.jsonl new file mode 100644 index 0000000..0539699 --- /dev/null +++ b/registered_books_annas_archive_es.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c3713837bfdec4604aad806b8ab8e56bd1bb5666867932e392c5ffddd69321 +size 1651151480 diff --git a/scripts/clean_spa.py b/scripts/clean_spa.py new file mode 
# 100644 index 0000000..f3d8f7d --- /dev/null +++ b/scripts/clean_spa.py @@ -0,0 +1,94 @@
"""Flatten records from ``annas_archive_es.jsonl`` into Spanish-keyed JSON lines.

Each input line is expected to carry a record under ``obj["metadata"]["record"]``.
Records whose ``generalFormat`` mentions "book" are remapped to Spanish field
names, de-duplicated by OCLC number, and appended to either
``registered_books_annas_archive_es.jsonl`` (record has an ISBN-13 not seen
before) or ``unregistered_books_annas_archive_es.jsonl`` (record has no ISBN-13).
"""
import json
import re
from pathlib import Path

# Resume marker: records whose 1-based position in the input is <= this value
# are not written again.  This script only reads index.txt; it never writes it.
path_index = Path("index.txt")
curr_index = int(path_index.read_text().strip()) if path_index.exists() else 0

# Identifiers already emitted during this run, used for de-duplication.
oclc = set()
isbn = set()

# Source field -> output field.  Several source fields may feed one output
# field; those targets ("temas", "descripcion", "isbns") are accumulated lists.
_KEYSMAP = {
    "title": "titulo",
    "subjectsText": "temas",
    "genre": "temas",
    "subjects": "temas",
    "series": "serie",
    "physicalDescription": "paginas",
    "oclcNumber": "oclc",
    "isbns": "isbns",
    "isbn13": "isbn13",
    "publicationDate": "año",
    "publisher": "editorial",
    "summary": "descripcion",
    "abstract": "descripcion",
    "summaries": "descripcion",
    "creator": "autor",
}

# Digit strings that survive an int() round-trip unchanged (no leading zeros).
_CANONICAL_INT = re.compile(r"0|[1-9][0-9]*")


def newitem(item1):
    """Return a Spanish-keyed copy of record *item1*, or None if it is not a book.

    A record counts as a book when its ``generalFormat`` contains "book"
    (case-insensitive); a missing or empty ``generalFormat`` is treated as
    "not a book".  Empty source values and unmapped keys are ignored.  Every
    output key in the mapping is always present: list-valued targets default
    to ``[]`` and the others to ``""``.
    """
    if "book" not in str(item1.get("generalFormat") or "").lower():
        return None
    item2 = {"temas": [], "isbns": [], "descripcion": []}
    for key1, value in item1.items():
        key2 = _KEYSMAP.get(key1)
        if key2 is None or not value:
            continue
        if key2 not in item2:
            item2[key2] = toint(value)
        elif isinstance(value, list):
            item2[key2].extend(toint(val) for val in value)
        else:
            item2[key2].append(toint(value))
    for key2 in _KEYSMAP.values():
        item2.setdefault(key2, "")
    return item2


def additem(dump, index, prefix):
    """Append *dump* to ``<prefix>_books_annas_archive_es.jsonl``.

    Nothing is written when *index* is not greater than the module-level
    ``curr_index`` resume marker; a notice is printed instead.
    """
    filename = "%s_books_annas_archive_es.jsonl" % prefix
    if index > curr_index:
        print(index, dump[0:99])
        # The JSON is produced with ensure_ascii=False, so pin the encoding
        # rather than depending on the platform default.
        with open(filename, "a", encoding="utf-8") as myfile:
            myfile.write(dump)
    else:
        print("Ignoring item %s" % index)


def toint(val):
    """Return ``int(val)`` for canonical digit strings, otherwise *val* unchanged.

    Non-string values (dicts, ints, None, ...) pass through untouched instead
    of raising.  Strings with leading zeros, such as many ISBN-10 values, are
    kept as strings because converting them would drop the zeros.
    """
    if isinstance(val, str) and _CANONICAL_INT.fullmatch(val):
        return int(val)
    return val


def setint(val1, rgx):
    """Return the first match of *rgx* in ``str(val1)`` as an int, or None.

    *rgx* should contain at most one capturing group, since the first element
    returned by ``re.findall`` is passed straight to ``int``.
    """
    val2 = re.findall(rgx, str(val1))
    return int(val2[0]) if val2 else None


def _finalize(item):
    """Normalise the year, page count and description of *item* in place."""
    # The previous pattern r"(\d+?)" was non-greedy with nothing after it, so
    # it captured a single digit ("2005" -> 2).  Capture a four-digit year.
    item["año"] = setint(item["año"], r"(\d{4})")
    item["paginas"] = setint(item["paginas"], r"(\d+?)\s*pages")
    # Non-string parts are stringified so the join below cannot fail.
    parts = [str(part) for part in item["descripcion"]]
    if item["isbns"]:
        related = ", ".join(str(e) for e in item["isbns"])
        parts.append("ISBN Relacionados: %s" % related)
    item["descripcion"] = "\n\n".join(parts)
    return item


def main():
    """Read ``annas_archive_es.jsonl`` and append the cleaned records to disk."""
    # Third-party dependency; imported here so the helpers above can be
    # imported without it being installed.
    import jsonlines

    with jsonlines.open("annas_archive_es.jsonl") as reader:
        for index, obj in enumerate(reader, start=1):
            try:
                item = newitem(obj["metadata"]["record"])
                if not item:
                    continue
                _finalize(item)
                if item["oclc"] in oclc:
                    continue
                oclc.add(item["oclc"])
                dump = json.dumps(item, ensure_ascii=False) + "\n"
                if not item["isbn13"]:
                    additem(dump, index, "unregistered")
                elif item["isbn13"] not in isbn:
                    isbn.add(item["isbn13"])
                    additem(dump, index, "registered")
            except Exception as err:
                # Best effort: report the malformed record and keep going.
                strs = (type(err).__name__, err.__traceback__.tb_lineno, str(err))
                print("%s at line %s: %s" % strs)


if __name__ == "__main__":
    main()

# diff --git a/scripts/select_spa.sh b/scripts/select_spa.sh new file mode 100644 index 0000000..7427a2d --- /dev/null +++ b/scripts/select_spa.sh @@ -0,0 +1 @@
# +jq -c 'select(.metadata.record.catalogingLanguage=="spa")' annas_archive.jsonl | tee -a annas_archive_es.jsonl
# diff --git a/unregistered_books_annas_archive_es.jsonl b/unregistered_books_annas_archive_es.jsonl new file mode 100644 index 0000000..eeff1c4 --- /dev/null +++ b/unregistered_books_annas_archive_es.jsonl @@ -0,0 +1,3 @@
# +version https://git-lfs.github.com/spec/v1
# +oid sha256:6f2de2d0227404c8c0b5cc88bd7280632d98dbfa28b127feaef02d73e31f1d2f
# +size 4548521533