Tweaked the xls2json script to split the synonyms
This commit is contained in:
+9310
-752
File diff suppressed because it is too large
Load Diff
+6358
-752
File diff suppressed because it is too large
Load Diff
+5928
-752
File diff suppressed because it is too large
Load Diff
+19554
-752
File diff suppressed because it is too large
Load Diff
+7124
-752
File diff suppressed because it is too large
Load Diff
+6402
-752
File diff suppressed because it is too large
Load Diff
+7472
-752
File diff suppressed because it is too large
Load Diff
+7165
-752
File diff suppressed because it is too large
Load Diff
+5600
-752
File diff suppressed because it is too large
Load Diff
+6601
-752
File diff suppressed because it is too large
Load Diff
+6046
-752
File diff suppressed because it is too large
Load Diff
+5925
-752
File diff suppressed because it is too large
Load Diff
+6726
-752
File diff suppressed because it is too large
Load Diff
+7367
-752
File diff suppressed because it is too large
Load Diff
+7393
-752
File diff suppressed because it is too large
Load Diff
+6788
-752
File diff suppressed because it is too large
Load Diff
+6313
-752
File diff suppressed because it is too large
Load Diff
+6816
-752
File diff suppressed because it is too large
Load Diff
+5398
-752
File diff suppressed because it is too large
Load Diff
+6855
-752
File diff suppressed because it is too large
Load Diff
+5398
-752
File diff suppressed because it is too large
Load Diff
+17
-7
@@ -9,9 +9,11 @@
|
|||||||
# the project website on github: https://github.com/iluvcapra/ucs-community
|
# the project website on github: https://github.com/iluvcapra/ucs-community
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
import re
|
||||||
import json
|
import json
|
||||||
from typing import Dict
|
|
||||||
import os
|
import os
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
# this key is added to every category entry created in the output files
|
# this key is added to every category entry created in the output files
|
||||||
UCS_VERSION = "8.2.1"
|
UCS_VERSION = "8.2.1"
|
||||||
@@ -29,9 +31,11 @@ data = pd.read_excel(EXCEL_FILE)
|
|||||||
|
|
||||||
# The English column headers aren't formatted like the other languages so we
|
# The English column headers aren't formatted like the other languages so we
|
||||||
# just have to special-case them
|
# just have to special-case them
|
||||||
langs: Dict[str,Dict[int,str]] = {'en': {0: 'Category', 1: 'SubCategory', 2:
|
langs: Dict[str,Dict[int,str | List[str]]] = {'en': {0: 'Category', 1:
|
||||||
'CatID', 3:'CatShort',
|
'SubCategory', 2: 'CatID',
|
||||||
4:'Explanations', 5: 'Synonyms'}}
|
3:'CatShort',
|
||||||
|
4:'Explanations', 5:
|
||||||
|
'Synonyms'}}
|
||||||
|
|
||||||
# step through each column
|
# step through each column
|
||||||
for i, (col_index, col_data) in enumerate(data.T.iterrows()):
|
for i, (col_index, col_data) in enumerate(data.T.iterrows()):
|
||||||
@@ -63,13 +67,19 @@ for lang in langs:
|
|||||||
for (_, row) in rows:
|
for (_, row) in rows:
|
||||||
|
|
||||||
# create a dict for the category on this row
|
# create a dict for the category on this row
|
||||||
category = {'version':UCS_VERSION}
|
category: Dict[str, str | List[str]] = {'version':UCS_VERSION}
|
||||||
for col_index in langs[lang]:
|
for col_index in langs[lang]:
|
||||||
key_name = langs[lang][col_index]
|
key_name = str(langs[lang][col_index])
|
||||||
category[key_name] = row.iloc[col_index]
|
category[key_name] = str(row.iloc[col_index])
|
||||||
# Save the English CatID so this can be cross-referenced
|
# Save the English CatID so this can be cross-referenced
|
||||||
category['CatID'] = row.iloc[2]
|
category['CatID'] = row.iloc[2]
|
||||||
|
|
||||||
|
if key_name == 'Synonyms':
|
||||||
|
# synonyms are stored in the spreadsheet as CSV; we should
|
||||||
|
# normalize this.
|
||||||
|
category['Synonyms'] = re.split(r'\W+', category['Synonyms'])
|
||||||
|
|
||||||
|
|
||||||
schedule.append(category)
|
schedule.append(category)
|
||||||
|
|
||||||
# and dump it to json
|
# and dump it to json
|
||||||
|
|||||||
Reference in New Issue
Block a user