Tweaked the xls2json script to split the synonyms

This commit is contained in:
Jamie Hardt
2024-10-27 19:45:34 -07:00
parent ac86cbe839
commit 558c53c34c
22 changed files with 152556 additions and 15799 deletions
+9310 -752
View File
File diff suppressed because it is too large Load Diff
+6358 -752
View File
File diff suppressed because it is too large Load Diff
+5928 -752
View File
File diff suppressed because it is too large Load Diff
+19554 -752
View File
File diff suppressed because it is too large Load Diff
+7124 -752
View File
File diff suppressed because it is too large Load Diff
+6402 -752
View File
File diff suppressed because it is too large Load Diff
+7472 -752
View File
File diff suppressed because it is too large Load Diff
+7165 -752
View File
File diff suppressed because it is too large Load Diff
+5600 -752
View File
File diff suppressed because it is too large Load Diff
+6601 -752
View File
File diff suppressed because it is too large Load Diff
+6046 -752
View File
File diff suppressed because it is too large Load Diff
+5925 -752
View File
File diff suppressed because it is too large Load Diff
+6726 -752
View File
File diff suppressed because it is too large Load Diff
+7367 -752
View File
File diff suppressed because it is too large Load Diff
+7393 -752
View File
File diff suppressed because it is too large Load Diff
+6788 -752
View File
File diff suppressed because it is too large Load Diff
+6313 -752
View File
File diff suppressed because it is too large Load Diff
+6816 -752
View File
File diff suppressed because it is too large Load Diff
+5398 -752
View File
File diff suppressed because it is too large Load Diff
+6855 -752
View File
File diff suppressed because it is too large Load Diff
+5398 -752
View File
File diff suppressed because it is too large Load Diff
+17 -7
View File
@@ -9,9 +9,11 @@
# the project website on github: https://github.com/iluvcapra/ucs-community # the project website on github: https://github.com/iluvcapra/ucs-community
import pandas as pd import pandas as pd
import re
import json import json
from typing import Dict
import os import os
from typing import Dict, List
# this key is added to every category entry created in the output files # this key is added to every category entry created in the output files
UCS_VERSION = "8.2.1" UCS_VERSION = "8.2.1"
@@ -29,9 +31,11 @@ data = pd.read_excel(EXCEL_FILE)
# The English column headers aren't formatted like the other languages so we # The English column headers aren't formatted like the other languages so we
# just have to special-case them # just have to special-case them
langs: Dict[str,Dict[int,str]] = {'en': {0: 'Category', 1: 'SubCategory', 2: langs: Dict[str,Dict[int,str | List[str]]] = {'en': {0: 'Category', 1:
'CatID', 3:'CatShort', 'SubCategory', 2: 'CatID',
4:'Explanations', 5: 'Synonyms'}} 3:'CatShort',
4:'Explanations', 5:
'Synonyms'}}
# step through each column # step through each column
for i, (col_index, col_data) in enumerate(data.T.iterrows()): for i, (col_index, col_data) in enumerate(data.T.iterrows()):
@@ -63,12 +67,18 @@ for lang in langs:
for (_, row) in rows: for (_, row) in rows:
# create a dict for the category on this row # create a dict for the category on this row
category = {'version':UCS_VERSION} category: Dict[str, str | List[str]] = {'version':UCS_VERSION}
for col_index in langs[lang]: for col_index in langs[lang]:
key_name = langs[lang][col_index] key_name = str(langs[lang][col_index])
category[key_name] = row.iloc[col_index] category[key_name] = str(row.iloc[col_index])
# Save the English CatID so this can be cross-referenced # Save the English CatID so this can be cross-referenced
category['CatID'] = row.iloc[2] category['CatID'] = row.iloc[2]
if key_name == 'Synonyms':
# synonyms are stored in the spreadsheet as CSV; we should
# normalize this.
category['Synonyms'] = re.split(r'\W+', category['Synonyms'])
schedule.append(category) schedule.append(category)