Init
This commit is contained in:
commit
285e112232
|
@ -0,0 +1,2 @@
|
|||
registered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text
|
||||
unregistered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
@ -0,0 +1,2 @@
|
|||
annas_archive.jsonl
|
||||
annas_archive_es.jsonl
|
|
@ -0,0 +1,68 @@
|
|||
key,len,freq
|
||||
aacid,243,1.0
|
||||
metadata.oclc_number,243,1.0
|
||||
metadata.type,243,1.0
|
||||
metadata.from_filenames,243,1.0
|
||||
metadata.record.oclcNumber,243,1.0
|
||||
metadata.record.title,243,1.0
|
||||
metadata.record.creator,243,1.0
|
||||
metadata.record.generalFormat,243,1.0
|
||||
metadata.record.specificFormat,243,1.0
|
||||
metadata.record.edition,243,1.0
|
||||
metadata.record.totalEditions,243,1.0
|
||||
metadata.record.publisher,243,1.0
|
||||
metadata.record.publicationPlace,226,0.93004
|
||||
metadata.record.publicationDate,243,1.0
|
||||
metadata.record.catalogingLanguage,243,1.0
|
||||
metadata.record.summary,76,0.31276
|
||||
metadata.record.physicalDescription,203,0.83539
|
||||
metadata.record.series,203,0.83539
|
||||
metadata.record.castNotes,203,0.83539
|
||||
metadata.record.languageNotes,203,0.83539
|
||||
metadata.record.subjectsText,203,0.83539
|
||||
metadata.record.cartographicData,203,0.83539
|
||||
metadata.record.dissertationInfo,196,0.80658
|
||||
metadata.record.performerNotes,203,0.83539
|
||||
metadata.record.genre,203,0.83539
|
||||
metadata.record.numericDesignation,203,0.83539
|
||||
metadata.record.audience,203,0.83539
|
||||
metadata.record.generalNotes,203,0.83539
|
||||
metadata.record.creditNotes,203,0.83539
|
||||
metadata.record.contentNotes,194,0.79835
|
||||
metadata.record.reproductionNotes,203,0.83539
|
||||
metadata.record.eventNotes,203,0.83539
|
||||
metadata.record.doi,203,0.83539
|
||||
metadata.record.peerReviewed,243,1.0
|
||||
metadata.record.mediumOfPerformance,203,0.83539
|
||||
metadata.record.issns,203,0.83539
|
||||
metadata.record.additionalPhysicalFormEntries,203,0.83539
|
||||
metadata.record.digitalAccessAndLocations,203,0.83539
|
||||
metadata.record.digitalObjectInfo,242,0.99588
|
||||
metadata.record.abstract,243,1.0
|
||||
metadata.record.evaluativeContent,203,0.83539
|
||||
metadata.record.otherFormats,243,1.0
|
||||
metadata.record.isbns,243,1.0
|
||||
metadata.record.isbn13,243,1.0
|
||||
metadata.record.openAccessLinks,217,0.893
|
||||
metadata.record.publication,243,1.0
|
||||
metadata.record.sourceIssn,203,0.83539
|
||||
metadata.record.sourceIsbns,203,0.83539
|
||||
metadata.record.contributors,243,1.0
|
||||
metadata.record.titleInfo.text,189,0.77778
|
||||
metadata.record.publisherName.text,169,0.69547
|
||||
metadata.record.machineReadableDate,42,0.17284
|
||||
metadata.record.seriesVolumes,42,0.17284
|
||||
metadata.record.subjects,40,0.16461
|
||||
metadata.record.summaries,40,0.16461
|
||||
metadata.record.openAccessLink,40,0.16461
|
||||
metadata.record.contentNotes.text,9,0.03704
|
||||
metadata.record.publisherName,20,0.0823
|
||||
metadata.record.dissertationInfo.institution,1,0.00412
|
||||
metadata.record.dissertationInfo.year,1,0.00412
|
||||
metadata.record.digitalObjectInfo.thumbnailImage,1,0.00412
|
||||
metadata.record.digitalObjectInfo.iiifImageInfoUrl,1,0.00412
|
||||
metadata.record.digitalObjectInfo.iiifItemManifestUrl,1,0.00412
|
||||
metadata.record.digitalObjectInfo.contentDMCollectionId,1,0.00412
|
||||
metadata.record.digitalObjectInfo.iiifCollectionManifestUrl,1,0.00412
|
||||
metadata.record.digitalObjectInfo.oaiSetName,1,0.00412
|
||||
metadata.record.filmInfo,1,0.00412
|
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e8c3713837bfdec4604aad806b8ab8e56bd1bb5666867932e392c5ffddd69321
|
||||
size 1651151480
|
|
@ -0,0 +1,94 @@
|
|||
import re
|
||||
import json
|
||||
import jsonlines
|
||||
from pathlib import Path
|
||||
|
||||
# Optional checkpoint file holding the index of the last record that was
# already written; additem() only writes records whose index is greater.
path_index = Path("index.txt")
if path_index.exists():
    curr_index = int(path_index.read_text().strip())
else:
    curr_index = 0

# Running position in the input file (1-based once the main loop starts).
index = 0

# OCLC numbers and ISBN-13 values seen so far, used to drop duplicates.
oclc, isbn = set(), set()
|
||||
|
||||
|
||||
# Source record field -> output field. Several source fields can feed the same
# list-valued output field ("temas", "descripcion").
_KEYSMAP = {
    "title": "titulo",
    "subjectsText": "temas",
    "genre": "temas",
    "subjects": "temas",
    "series": "serie",
    "physicalDescription": "paginas",
    "oclcNumber": "oclc",
    "isbns": "isbns",
    "isbn13": "isbn13",
    "publicationDate": "año",
    "publisher": "editorial",
    "summary": "descripcion",
    "abstract": "descripcion",
    "summaries": "descripcion",
    "creator": "autor",
}

# Output fields that accumulate values instead of holding a single one.
_LIST_FIELDS = ("temas", "isbns", "descripcion")


def newitem(item1):
    """Convert a source record into the flat output item.

    Args:
        item1: dict with the record fields (the ``metadata.record`` object).

    Returns:
        A dict containing every output field named in the mapping, or
        ``None`` when the record's ``generalFormat`` does not contain
        "book". List fields default to ``[]`` and scalar fields to ``""``.
        A missing or non-string ``generalFormat`` is treated as "not a book"
        instead of raising.
    """
    general_format = item1.get("generalFormat")
    if not isinstance(general_format, str) or "book" not in general_format.lower():
        return None

    # Fresh lists per call so items never share state.
    item2 = {field: [] for field in _LIST_FIELDS}

    for key1, value in item1.items():
        key2 = _KEYSMAP.get(key1)
        # Skip unmapped fields and empty values ("", None, [], 0).
        if key2 is None or not value:
            continue
        if key2 not in _LIST_FIELDS:
            item2[key2] = toint(value)
        elif isinstance(value, list):
            item2[key2].extend(toint(val) for val in value)
        else:
            item2[key2].append(toint(value))

    # Guarantee a stable schema: absent scalar fields become "".
    for key2 in _KEYSMAP.values():
        item2.setdefault(key2, "")
    return item2
|
||||
|
||||
|
||||
def additem(dump, index, prefix):
    """Append one serialized item to the output file selected by *prefix*.

    Args:
        dump: the already serialized line (including its trailing newline).
        index: position of the record in the input file.
        prefix: first part of the output file name, which is
            ``<prefix>_books_annas_archive_es.jsonl``.

    Records whose index is not greater than the module-level ``curr_index``
    are not written; a notice is printed instead.
    """
    if index <= curr_index:
        print("Ignoring item %s" % index)
        return

    filename = "%s_books_annas_archive_es.jsonl" % prefix
    print(index, dump[0:99])
    # The payload is produced with ensure_ascii=False, so it may contain
    # non-ASCII text; pin UTF-8 instead of relying on the platform default.
    with open(filename, "a", encoding="utf-8") as myfile:
        myfile.write(dump)
|
||||
|
||||
|
||||
def toint(val):
    """Return *val* as an ``int`` when that conversion is lossless.

    Args:
        val: any value; only strings are candidates for conversion.

    Returns:
        ``int(val)`` when *val* is a string of decimal digits that
        round-trips unchanged through ``int``; otherwise *val* itself.
        Non-string values (numbers, ``None``, lists, dicts) are returned
        untouched instead of raising.
    """
    if not isinstance(val, str):
        return val
    # isdecimal() only accepts characters int() can parse (isdigit() also
    # accepts e.g. superscripts). The round-trip check keeps identifiers with
    # leading zeros, such as ISBN-10 values, as strings so no digit is lost.
    if val.isdecimal() and str(int(val)) == val:
        return int(val)
    return val
|
||||
|
||||
|
||||
def setint(val1, rgx):
    """Extract an integer from *val1* using the pattern *rgx*.

    *val1* is converted to text, every match of *rgx* is collected, and the
    first one is converted with ``int``. Returns ``None`` when nothing matches.
    """
    matches = re.findall(rgx, str(val1))
    if not matches:
        return None
    first_match = matches[0]
    return int(first_match)
|
||||
|
||||
|
||||
# Main pass: read every record, convert it, drop duplicates and append the
# result to the "registered" (has an ISBN-13) or "unregistered" output file.
with jsonlines.open("annas_archive_es.jsonl") as reader:
    for index, obj in enumerate(reader, start=1):
        try:
            item = newitem(obj["metadata"]["record"])
            if not item:
                continue

            # First four-digit run in the publication date. The previous lazy
            # pattern (\d+?) matched a single digit, turning "2005" into 2.
            item["año"] = setint(item["año"], r"(\d{4})")
            # NOTE(review): only the English word "pages" is recognized; the
            # records may use another wording - confirm against the data.
            item["paginas"] = setint(item["paginas"], r"(\d+?)\s*pages")

            if item["isbns"]:
                isbns = [str(e) for e in item["isbns"]]
                extra = "ISBN Relacionados: %s" % ", ".join(isbns)
                item["descripcion"].append(extra)
            item["descripcion"] = "\n\n".join(item["descripcion"])

            # Keep only the first record seen for each OCLC number.
            if item["oclc"] in oclc:
                continue
            oclc.add(item["oclc"])

            dump = json.dumps(item, ensure_ascii=False) + "\n"
            if not item["isbn13"]:
                additem(dump, index, "unregistered")
            elif item["isbn13"] not in isbn:
                # Keep only the first record seen for each ISBN-13.
                isbn.add(item["isbn13"])
                additem(dump, index, "registered")
        except Exception as err:
            # Best effort: report the failing record and continue with the
            # next one so a single malformed record does not stop the run.
            strs = (type(err).__name__, err.__traceback__.tb_lineno, str(err))
            print("%s at line %s: %s" % strs)
|
|
@ -0,0 +1 @@
|
|||
# Keep only the records whose cataloging language is Spanish ("spa").
# jq -c writes one compact JSON object per line; tee -a echoes them to stdout
# and appends them to annas_archive_es.jsonl, so re-running this command adds
# the matching records to that file again.
jq -c 'select(.metadata.record.catalogingLanguage=="spa")' annas_archive.jsonl | tee -a annas_archive_es.jsonl
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6f2de2d0227404c8c0b5cc88bd7280632d98dbfa28b127feaef02d73e31f1d2f
|
||||
size 4548521533
|
Loading…
Reference in New Issue