Commit ·
d43946a
1
Parent(s): 2237d8d
v3.2 update
Browse files- README.md +15 -13
- model.safetensors +1 -1
- src/train_t5.py +3 -3
- tokenizer.json +2 -16
README.md
CHANGED
|
@@ -22,16 +22,16 @@ model-index:
|
|
| 22 |
metrics:
|
| 23 |
- name: Training Loss
|
| 24 |
type: loss
|
| 25 |
-
value:
|
| 26 |
- name: Evaluation Loss
|
| 27 |
type: loss
|
| 28 |
-
value: 2.
|
| 29 |
- name: CER
|
| 30 |
type: cer
|
| 31 |
-
value: 0.
|
| 32 |
- name: Exact Match
|
| 33 |
type: accuracy
|
| 34 |
-
value: 0.
|
| 35 |
---
|
| 36 |
# AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
|
| 37 |
|
|
@@ -39,13 +39,13 @@ model-index:
|
|
| 39 |
|
| 40 |
⚠️ Current Limitations
|
| 41 |
|
| 42 |
-
-
|
| 43 |
- Occasional vowel omission or compression
|
| 44 |
-
-
|
| 45 |
|
| 46 |
> Development information
|
| 47 |
-
> - 🚧 **Current version:** v3.
|
| 48 |
-
> - ⏳ **Upcoming release:**
|
| 49 |
|
| 50 |
---
|
| 51 |
|
|
@@ -149,12 +149,14 @@ uv run python src/train_t5.py --stage 2 --hf-model your-username/model-name
|
|
| 149 |
|
| 150 |
## 📋 Version Changelog
|
| 151 |
|
| 152 |
-
* **AramT5 Baseline (May 20, 2026):**
|
| 153 |
|
| 154 |
-
* **AramT5 v1 (May 20, 2026):**
|
| 155 |
|
| 156 |
-
* **AramT5 v2 (May 20, 2026):**
|
| 157 |
|
| 158 |
-
* **AramT5 v3 (May 21, 2026):**
|
| 159 |
|
| 160 |
-
* **AramT5 v3.1 (May 22, 2026):**
|
|
|
|
|
|
|
|
|
| 22 |
metrics:
|
| 23 |
- name: Training Loss
|
| 24 |
type: loss
|
| 25 |
+
value: 1.9013
|
| 26 |
- name: Evaluation Loss
|
| 27 |
type: loss
|
| 28 |
+
value: 2.0293
|
| 29 |
- name: CER
|
| 30 |
type: cer
|
| 31 |
+
value: 0.1602
|
| 32 |
- name: Exact Match
|
| 33 |
type: accuracy
|
| 34 |
+
value: 0.6217
|
| 35 |
---
|
| 36 |
# AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
|
| 37 |
|
|
|
|
| 39 |
|
| 40 |
⚠️ Current Limitations
|
| 41 |
|
| 42 |
+
- Occasional under-generation (shorter outputs than expected)
|
| 43 |
- Occasional vowel omission or compression
|
| 44 |
+
- Reliability varies on very long, uncommon, or morphologically complex words and sentences
|
| 45 |
|
| 46 |
> Development information
|
| 47 |
+
> - 🚧 **Current version:** v3.2 (stage 4)
|
| 48 |
+
> - ⏳ **Upcoming release:** v4 (stage 5)
|
| 49 |
|
| 50 |
---
|
| 51 |
|
|
|
|
| 149 |
|
| 150 |
## 📋 Version Changelog
|
| 151 |
|
| 152 |
+
* **AramT5 Baseline (May 20, 2026):** Base `t5-small` model fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
|
| 153 |
|
| 154 |
+
* **AramT5 v1 (May 20, 2026):** Fine-tuned on 40k records, across 20 epochs, leveraging the stage 2 configuration. A massive upgrade compared to the baseline version, v1 showcased significantly improved morphological handling of not only single words but also sequences with noticeable complexity
|
| 155 |
|
| 156 |
+
* **AramT5 v2 (May 20, 2026):** Fine-tuned on 60k records, across 20 epochs, leveraging the stage 3 configuration. Making use of additional augmented data for atomic tokens, this version proved much more reliable at handling single-word input while exhibiting improvements in transliterating longer Syriac sentences
|
| 157 |
|
| 158 |
+
* **AramT5 v3 (May 21, 2026):** Fine-tuned on 80k records, across 20 epochs, leveraging the stage 4 configuration. This version showcased even stronger transliteration capabilities for longer sentences, while retaining existing knowledge on multiple single words
|
| 159 |
|
| 160 |
+
* **AramT5 v3.1 (May 22, 2026):** Fine-tuned on 120k records, across 20 epochs, leveraging the stage 4 configuration. Essentially a re-run or fine-tuning of v3, this version was trained on more data with a different distribution (and more manual entries) to leverage a more balanced mix between single words and multi-word phrases, culminating in a version that exhibits superior transliteration capabilities
|
| 161 |
+
|
| 162 |
+
* **AramT5 v3.2 (May 23, 2026):** Fine-tuned on 120k records, across 10 epochs, leveraging the stage 4 configuration. A refinement of v3.1, this version leveraged corrected word forms, a more comprehensive manual vocabulary, and the addition of fully-vocalised and seyame-based plurals, resulting in the model correcting its understanding of various atomic words and learning a more comprehensive distinction between singular and plural words
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 209216552
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0dc22e4241c40835061a8408785a0b93cf5b139725732e806e28115dda2b26d
|
| 3 |
size 209216552
|
src/train_t5.py
CHANGED
|
@@ -188,7 +188,7 @@ STAGE_CONFIGS = {
|
|
| 188 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 189 |
"new_range_ratio": 0.45, # 45% from new range (51-70 chars)
|
| 190 |
"new_range_min": 51,
|
| 191 |
-
"num_epochs":
|
| 192 |
"learning_rate": 8e-5, # Higher LR to unlearn early-stopping bias from imbalanced data
|
| 193 |
},
|
| 194 |
5: {
|
|
@@ -199,7 +199,7 @@ STAGE_CONFIGS = {
|
|
| 199 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 200 |
"new_range_ratio": 0.45, # 45% from new range (71-100 chars)
|
| 201 |
"new_range_min": 71,
|
| 202 |
-
"num_epochs":
|
| 203 |
"learning_rate": 5e-5, # Slightly higher to reinforce multi-word patterns
|
| 204 |
"repetition_penalty": 1.2,
|
| 205 |
},
|
|
@@ -211,7 +211,7 @@ STAGE_CONFIGS = {
|
|
| 211 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 212 |
"new_range_ratio": 0.40, # 40% from new range (101-150 chars)
|
| 213 |
"new_range_min": 101,
|
| 214 |
-
"num_epochs":
|
| 215 |
"learning_rate": 4e-5, # Fine-tuning polish
|
| 216 |
"repetition_penalty": 1.2,
|
| 217 |
},
|
|
|
|
| 188 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 189 |
"new_range_ratio": 0.45, # 45% from new range (51-70 chars)
|
| 190 |
"new_range_min": 51,
|
| 191 |
+
"num_epochs": 10,
|
| 192 |
"learning_rate": 8e-5, # Higher LR to unlearn early-stopping bias from imbalanced data
|
| 193 |
},
|
| 194 |
5: {
|
|
|
|
| 199 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 200 |
"new_range_ratio": 0.45, # 45% from new range (71-100 chars)
|
| 201 |
"new_range_min": 71,
|
| 202 |
+
"num_epochs": 10,
|
| 203 |
"learning_rate": 5e-5, # Slightly higher to reinforce multi-word patterns
|
| 204 |
"repetition_penalty": 1.2,
|
| 205 |
},
|
|
|
|
| 211 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 212 |
"new_range_ratio": 0.40, # 40% from new range (101-150 chars)
|
| 213 |
"new_range_min": 101,
|
| 214 |
+
"num_epochs": 10,
|
| 215 |
"learning_rate": 4e-5, # Fine-tuning polish
|
| 216 |
"repetition_penalty": 1.2,
|
| 217 |
},
|
tokenizer.json
CHANGED
|
@@ -1,21 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
-
"truncation":
|
| 4 |
-
|
| 5 |
-
"max_length": 128,
|
| 6 |
-
"strategy": "LongestFirst",
|
| 7 |
-
"stride": 0
|
| 8 |
-
},
|
| 9 |
-
"padding": {
|
| 10 |
-
"strategy": {
|
| 11 |
-
"Fixed": 128
|
| 12 |
-
},
|
| 13 |
-
"direction": "Right",
|
| 14 |
-
"pad_to_multiple_of": null,
|
| 15 |
-
"pad_id": 0,
|
| 16 |
-
"pad_type_id": 0,
|
| 17 |
-
"pad_token": "<pad>"
|
| 18 |
-
},
|
| 19 |
"added_tokens": [
|
| 20 |
{
|
| 21 |
"id": 0,
|
|
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|