This commit is contained in:
Titivillus 2024-04-12 21:37:29 +00:00
commit 285e112232
7 changed files with 173 additions and 0 deletions

2
.gitattributes vendored Normal file
View File

@ -0,0 +1,2 @@
registered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text
unregistered_books_annas_archive_es.jsonl filter=lfs diff=lfs merge=lfs -text

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
annas_archive.jsonl
annas_archive_es.jsonl

68
keys_list.csv Normal file
View File

@ -0,0 +1,68 @@
key,len,freq
aacid,243,1.0
metadata.oclc_number,243,1.0
metadata.type,243,1.0
metadata.from_filenames,243,1.0
metadata.record.oclcNumber,243,1.0
metadata.record.title,243,1.0
metadata.record.creator,243,1.0
metadata.record.generalFormat,243,1.0
metadata.record.specificFormat,243,1.0
metadata.record.edition,243,1.0
metadata.record.totalEditions,243,1.0
metadata.record.publisher,243,1.0
metadata.record.publicationPlace,226,0.93004
metadata.record.publicationDate,243,1.0
metadata.record.catalogingLanguage,243,1.0
metadata.record.summary,76,0.31276
metadata.record.physicalDescription,203,0.83539
metadata.record.series,203,0.83539
metadata.record.castNotes,203,0.83539
metadata.record.languageNotes,203,0.83539
metadata.record.subjectsText,203,0.83539
metadata.record.cartographicData,203,0.83539
metadata.record.dissertationInfo,196,0.80658
metadata.record.performerNotes,203,0.83539
metadata.record.genre,203,0.83539
metadata.record.numericDesignation,203,0.83539
metadata.record.audience,203,0.83539
metadata.record.generalNotes,203,0.83539
metadata.record.creditNotes,203,0.83539
metadata.record.contentNotes,194,0.79835
metadata.record.reproductionNotes,203,0.83539
metadata.record.eventNotes,203,0.83539
metadata.record.doi,203,0.83539
metadata.record.peerReviewed,243,1.0
metadata.record.mediumOfPerformance,203,0.83539
metadata.record.issns,203,0.83539
metadata.record.additionalPhysicalFormEntries,203,0.83539
metadata.record.digitalAccessAndLocations,203,0.83539
metadata.record.digitalObjectInfo,242,0.99588
metadata.record.abstract,243,1.0
metadata.record.evaluativeContent,203,0.83539
metadata.record.otherFormats,243,1.0
metadata.record.isbns,243,1.0
metadata.record.isbn13,243,1.0
metadata.record.openAccessLinks,217,0.893
metadata.record.publication,243,1.0
metadata.record.sourceIssn,203,0.83539
metadata.record.sourceIsbns,203,0.83539
metadata.record.contributors,243,1.0
metadata.record.titleInfo.text,189,0.77778
metadata.record.publisherName.text,169,0.69547
metadata.record.machineReadableDate,42,0.17284
metadata.record.seriesVolumes,42,0.17284
metadata.record.subjects,40,0.16461
metadata.record.summaries,40,0.16461
metadata.record.openAccessLink,40,0.16461
metadata.record.contentNotes.text,9,0.03704
metadata.record.publisherName,20,0.0823
metadata.record.dissertationInfo.institution,1,0.00412
metadata.record.dissertationInfo.year,1,0.00412
metadata.record.digitalObjectInfo.thumbnailImage,1,0.00412
metadata.record.digitalObjectInfo.iiifImageInfoUrl,1,0.00412
metadata.record.digitalObjectInfo.iiifItemManifestUrl,1,0.00412
metadata.record.digitalObjectInfo.contentDMCollectionId,1,0.00412
metadata.record.digitalObjectInfo.iiifCollectionManifestUrl,1,0.00412
metadata.record.digitalObjectInfo.oaiSetName,1,0.00412
metadata.record.filmInfo,1,0.00412
1 key len freq
2 aacid 243 1.0
3 metadata.oclc_number 243 1.0
4 metadata.type 243 1.0
5 metadata.from_filenames 243 1.0
6 metadata.record.oclcNumber 243 1.0
7 metadata.record.title 243 1.0
8 metadata.record.creator 243 1.0
9 metadata.record.generalFormat 243 1.0
10 metadata.record.specificFormat 243 1.0
11 metadata.record.edition 243 1.0
12 metadata.record.totalEditions 243 1.0
13 metadata.record.publisher 243 1.0
14 metadata.record.publicationPlace 226 0.93004
15 metadata.record.publicationDate 243 1.0
16 metadata.record.catalogingLanguage 243 1.0
17 metadata.record.summary 76 0.31276
18 metadata.record.physicalDescription 203 0.83539
19 metadata.record.series 203 0.83539
20 metadata.record.castNotes 203 0.83539
21 metadata.record.languageNotes 203 0.83539
22 metadata.record.subjectsText 203 0.83539
23 metadata.record.cartographicData 203 0.83539
24 metadata.record.dissertationInfo 196 0.80658
25 metadata.record.performerNotes 203 0.83539
26 metadata.record.genre 203 0.83539
27 metadata.record.numericDesignation 203 0.83539
28 metadata.record.audience 203 0.83539
29 metadata.record.generalNotes 203 0.83539
30 metadata.record.creditNotes 203 0.83539
31 metadata.record.contentNotes 194 0.79835
32 metadata.record.reproductionNotes 203 0.83539
33 metadata.record.eventNotes 203 0.83539
34 metadata.record.doi 203 0.83539
35 metadata.record.peerReviewed 243 1.0
36 metadata.record.mediumOfPerformance 203 0.83539
37 metadata.record.issns 203 0.83539
38 metadata.record.additionalPhysicalFormEntries 203 0.83539
39 metadata.record.digitalAccessAndLocations 203 0.83539
40 metadata.record.digitalObjectInfo 242 0.99588
41 metadata.record.abstract 243 1.0
42 metadata.record.evaluativeContent 203 0.83539
43 metadata.record.otherFormats 243 1.0
44 metadata.record.isbns 243 1.0
45 metadata.record.isbn13 243 1.0
46 metadata.record.openAccessLinks 217 0.893
47 metadata.record.publication 243 1.0
48 metadata.record.sourceIssn 203 0.83539
49 metadata.record.sourceIsbns 203 0.83539
50 metadata.record.contributors 243 1.0
51 metadata.record.titleInfo.text 189 0.77778
52 metadata.record.publisherName.text 169 0.69547
53 metadata.record.machineReadableDate 42 0.17284
54 metadata.record.seriesVolumes 42 0.17284
55 metadata.record.subjects 40 0.16461
56 metadata.record.summaries 40 0.16461
57 metadata.record.openAccessLink 40 0.16461
58 metadata.record.contentNotes.text 9 0.03704
59 metadata.record.publisherName 20 0.0823
60 metadata.record.dissertationInfo.institution 1 0.00412
61 metadata.record.dissertationInfo.year 1 0.00412
62 metadata.record.digitalObjectInfo.thumbnailImage 1 0.00412
63 metadata.record.digitalObjectInfo.iiifImageInfoUrl 1 0.00412
64 metadata.record.digitalObjectInfo.iiifItemManifestUrl 1 0.00412
65 metadata.record.digitalObjectInfo.contentDMCollectionId 1 0.00412
66 metadata.record.digitalObjectInfo.iiifCollectionManifestUrl 1 0.00412
67 metadata.record.digitalObjectInfo.oaiSetName 1 0.00412
68 metadata.record.filmInfo 1 0.00412

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8c3713837bfdec4604aad806b8ab8e56bd1bb5666867932e392c5ffddd69321
size 1651151480

94
scripts/clean_spa.py Normal file
View File

@ -0,0 +1,94 @@
import re
import json
import jsonlines
from pathlib import Path
# index.txt, when present, supplies the record number up to which additem()
# skips writing; without the file every record is eligible.
path_index = Path("index.txt")
if path_index.exists():
    curr_index = int(path_index.read_text().strip())
else:
    curr_index = 0

# Running count of the records read from the input file.
index = 0

# OCLC numbers and ISBN-13 values seen so far, used to drop duplicates.
oclc, isbn = set(), set()
def newitem(item1):
    """Convert a raw catalogue record into a flat item with Spanish keys.

    Args:
        item1: Mapping of record fields (the ``metadata.record`` object of
            one input line).

    Returns:
        A dict holding every target key of the mapping below (``titulo``,
        ``temas``, ``serie``, ``paginas``, ``oclc``, ``isbns``, ``isbn13``,
        ``año``, ``editorial``, ``descripcion``, ``autor``). ``temas``,
        ``isbns`` and ``descripcion`` are lists collecting every source
        field mapped onto them; the other keys hold a single value, or ``""``
        when the record has no usable value. Returns ``None`` when
        ``generalFormat`` is missing, is not a string, or does not contain
        "book" (case-insensitive).
    """
    keysmap = {
        "title": "titulo",
        "subjectsText": "temas",
        "genre": "temas",
        "subjects": "temas",
        "series": "serie",
        "physicalDescription": "paginas",
        "oclcNumber": "oclc",
        "isbns": "isbns",
        "isbn13": "isbn13",
        "publicationDate": "año",
        "publisher": "editorial",
        "summary": "descripcion",
        "abstract": "descripcion",
        "summaries": "descripcion",
        "creator": "autor",
    }

    # A record without a textual generalFormat cannot be identified as a
    # book, so it is skipped instead of raising KeyError/AttributeError.
    general_format = item1.get("generalFormat")
    if not isinstance(general_format, str):
        return None
    if "book" not in general_format.lower():
        return None

    # Keys pre-seeded with a list accumulate values from several source
    # fields; every other key is assigned once.
    item2 = {"temas": [], "isbns": [], "descripcion": []}
    for key1, value in item1.items():
        if not value or key1 not in keysmap:
            continue
        key2 = keysmap[key1]
        if key2 not in item2:
            item2[key2] = toint(value)
        elif isinstance(value, list):
            item2[key2].extend(toint(val) for val in value)
        else:
            item2[key2].append(toint(value))

    # Guarantee that every target key exists, even when the source lacked it.
    for key2 in keysmap.values():
        item2.setdefault(key2, "")
    return item2
def additem(dump, index, prefix):
    """Append one serialized item to the output file selected by *prefix*.

    Args:
        dump: Text to append (the caller passes one JSON line).
        index: Record counter supplied by the caller; the item is written
            only when it is greater than the module-level ``curr_index``.
        prefix: Output selector; the target file is
            ``<prefix>_books_annas_archive_es.jsonl``.
    """
    filename = "%s_books_annas_archive_es.jsonl" % prefix
    if index <= curr_index:
        print("Ignoring item %s" % index)
        return

    # Progress trace: record number plus the start of the line.
    print(index, dump[0:99])
    # The caller serializes with ensure_ascii=False, so the encoding is
    # pinned to UTF-8 instead of relying on the platform default.
    with open(filename, "a", encoding="utf-8") as myfile:
        myfile.write(dump)
def toint(val):
    """Return *val* as an ``int`` when it is a string made only of digits.

    Any other value (text containing non-digit characters, the empty string,
    or a non-string such as ``None``, a number, a list or a dict) is returned
    unchanged instead of raising.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() refuses to parse.
    if isinstance(val, str) and val.isdecimal():
        return int(val)
    return val
def setint(val1, rgx):
    """Pull the first match of *rgx* out of ``str(val1)`` as an ``int``.

    Args:
        val1: Any value; it is converted with ``str()`` before searching.
        rgx: Regular expression whose (single) capture holds the digits.

    Returns:
        The first captured text converted with ``int()``, or ``None`` when
        the pattern does not match at all.
    """
    matches = re.findall(rgx, str(val1))
    if not matches:
        return None
    return int(matches[0])
# Read the filtered records one by one, normalise each book record and append
# it to the "registered" output (it has an ISBN-13) or to the "unregistered"
# one (it has none), skipping OCLC numbers and ISBN-13 values already emitted.
with jsonlines.open("annas_archive_es.jsonl") as reader:
    for index, obj in enumerate(reader, start=1):
        try:
            item = newitem(obj["metadata"]["record"])
            if not item:
                continue

            # Greedy \d+ captures the whole first number; the previous lazy
            # form \d+? stopped after one digit ("2005" became 2).
            item["año"] = setint(item["año"], r"(\d+)")
            item["paginas"] = setint(item["paginas"], r"(\d+?)\s*pages")

            if item["isbns"]:
                isbns = ", ".join(map(str, item["isbns"]))
                item["descripcion"].append("ISBN Relacionados: %s" % isbns)
            # Entries may have been turned into ints by toint(), so they are
            # converted back to text before joining.
            item["descripcion"] = "\n\n".join(map(str, item["descripcion"]))

            if item["oclc"] in oclc:
                continue
            oclc.add(item["oclc"])

            dump = json.dumps(item, ensure_ascii=False) + "\n"
            if not item["isbn13"]:
                additem(dump, index, "unregistered")
            elif item["isbn13"] not in isbn:
                isbn.add(item["isbn13"])
                additem(dump, index, "registered")
        except Exception as err:
            # Best effort: report the failing record and carry on with the
            # next one rather than aborting the whole run.
            strs = (type(err).__name__, err.__traceback__.tb_lineno, str(err))
            print("%s at line %s: %s" % strs)

1
scripts/select_spa.sh Normal file
View File

@ -0,0 +1 @@
# Keep only the records whose cataloging language is "spa" and append them,
# one compact JSON object per line, to annas_archive_es.jsonl; tee also
# echoes every kept record to stdout.
jq --compact-output 'select(.metadata.record.catalogingLanguage=="spa")' \
    annas_archive.jsonl \
    | tee -a annas_archive_es.jsonl

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f2de2d0227404c8c0b5cc88bd7280632d98dbfa28b127feaef02d73e31f1d2f
size 4548521533