gladiaio · egenthon-cmd · Mar 16, 2026 · Mar 17, 2026 · Mar 17, 2026 · Apr 7, 2026
diff --git a/normalization/languages/french/number_normalizer.py b/normalization/languages/french/number_normalizer.py
@@ -0,0 +1,72 @@
+"""French number normalizer using text2num's alpha2digit.
+
+Converts spelled-out numbers to digits (e.g. vingt trois → 23) and handles
+mixed digit+word forms (e.g. 3 milliards → trois milliards) before conversion
+so alpha2digit does not misinterpret them.
+"""
+
+import re
+
+try:
+    from text_to_num import alpha2digit
+except ImportError:
+    alpha2digit = None
+
+
+# Single digits as French words, used to rewrite "3 milliards" as "trois milliards".
+_DIGIT_TO_FRENCH: dict[str, str] = {
+    "0": "zéro",
+    "1": "un",
+    "2": "deux",
+    "3": "trois",
+    "4": "quatre",
+    "5": "cinq",
+    "6": "six",
+    "7": "sept",
+    "8": "huit",
+    "9": "neuf",
+}
+
+# Integer followed by a scale word (million/milliard/billion/trillion). The
+# lookbehind skips the fractional part of a decimal such as "1,5 milliards".
+_RE_MIXED_NUMBER = re.compile(
+    r"(?<!\d[.,])\b(\d+)\s+(millions?|milliards?|billions?|trillions?)\b",
+    re.IGNORECASE,
+)
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Spell out a lone digit before a scale word ('3 milliards' → 'trois milliards').
+
+    alpha2digit may concatenate a lone digit with the following word; converting
+    the digit to a word avoids that. Multi-digit integers and the fractional part
+    of decimals such as '1,5 milliards' are left unchanged.
+    """
+
+    def replace(match: re.Match) -> str:
+        word = _DIGIT_TO_FRENCH.get(match.group(1))  # None unless a single 0-9 digit
+        if word is None:
+            return match.group(0)
+        return f"{word} {match.group(2)}"
+
+    return _RE_MIXED_NUMBER.sub(replace, text)
+
+
+class FrenchNumberNormalizer:
+    """Callable that rewrites French number words as digits using text2num.
+
+    Mixed digit+word forms (e.g. 3 milliards) are first spelled out by
+    _normalize_mixed_numbers, then the text is handed to alpha2digit.
+    """
+
+    def __init__(self) -> None:
+        if alpha2digit is None:
+            hint = "Install it with: uv add text2num"
+            raise ImportError(
+                f"French number normalization requires the text2num package. {hint}"
+            )
+        self._alpha2digit = alpha2digit
+
+    def __call__(self, text: str) -> str:
+        spelled_out = _normalize_mixed_numbers(text)
+        return self._alpha2digit(spelled_out, "fr")
diff --git a/normalization/languages/french/operators.py b/normalization/languages/french/operators.py
@@ -1,9 +1,30 @@
+import re
+
 from normalization.languages.base import (
     LanguageConfig,
     LanguageOperators,
 )
+from normalization.languages.french.number_normalizer import FrenchNumberNormalizer
+from normalization.languages.french.replacements import FRENCH_REPLACEMENTS
+from normalization.languages.french.sentence_replacements import (
+    FRENCH_SENTENCE_REPLACEMENTS,
+)
 from normalization.languages.registry import register_language
 
+# French words for 0-9 mapped to digit strings; passed to FRENCH_CONFIG as digit_words.
+_FRENCH_DIGIT_WORDS: dict[str, str] = {
+    "zéro": "0",
+    "un": "1",
+    "deux": "2",
+    "trois": "3",
+    "quatre": "4",
+    "cinq": "5",
+    "six": "6",
+    "sept": "7",
+    "huit": "8",
+    "neuf": "9",
+}
+
 FRENCH_CONFIG = LanguageConfig(
     code="fr",
     decimal_separator=",",
@@ -29,10 +50,89 @@
         "¥": "yens",
     },
     filler_words=["euh", "hum", "beh", "bah", "ben", "hein"],
+    digit_words=_FRENCH_DIGIT_WORDS,
+    sentence_replacements=FRENCH_SENTENCE_REPLACEMENTS,
+    number_words=[
+        "zéro",
+        "un",
+        "deux",
+        "trois",
+        "quatre",
+        "cinq",
+        "six",
+        "sept",
+        "huit",
+        "neuf",
+        "dix",
+        "onze",
+        "douze",
+        "treize",
+        "quatorze",
+        "quinze",
+        "seize",
+        "vingt",
+        "trente",
+        "quarante",
+        "cinquante",
+        "soixante",
+        "septante",
+        "octante",
+        "huitante",
+        "nonante",
+        "cent",
+        "mille",
+        "million",
+        "millions",
+        "milliard",
+        "milliards",
+        "billion",
+        "billions",
+        "trillion",
+        "trillions",
+    ],
+    plus_word="plus",
 )
 
 
 @register_language
 class FrenchOperators(LanguageOperators):
-    def __init__(self):
+    """French language operators: contractions, written numbers, word replacements."""
+
+    def __init__(self) -> None:
         super().__init__(FRENCH_CONFIG)
+        self._number_normalizer = FrenchNumberNormalizer()
+
+    def expand_contractions(self, text: str) -> str:
+        """Expand informal spoken contractions (j'veux → je veux) before consonants.
+
+        Elision before a vowel or h (j'ai, c'est, l'ami, d'accord, l'œil) is the
+        standard written form and is preserved. A clitic is expanded only when its
+        apostrophe is directly followed by a consonant letter, so an apostrophe before
+        a space, digit, punctuation or the end of the text is left alone.
+
+        Args:
+            text: Text that may contain contractions such as j'suis or qu'tu.
+
+        Returns:
+            The text with such contractions expanded; expanded clitics are lowercase.
+        """
+        # Vowels (with ligatures and diaereses) + h: elision before these is standard.
+        vowels = "aàâäæeéèêëiîïoôöœuùûüyÿh"
+        # [^\W\d_] requires a letter after the apostrophe; the negative lookahead then
+        # rejects vowels and h, leaving consonants only.
+        pattern = rf"\b(j|c|d|qu|n|s|m|t|l)'(?=[^\W\d_])(?![{vowels}{vowels.upper()}])"
+        # One pass over all clitics; each match becomes its lowercase stem + "e ".
+        return re.sub(
+            pattern, lambda m: f"{m.group(1).lower()}e ", text, flags=re.IGNORECASE
+        )
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Replace spelled-out French numbers with digits (vingt trois → 23).
+
+        Delegates to the FrenchNumberNormalizer instance created in __init__.
+        """
+        normalize = self._number_normalizer
+        return normalize(text)
+
+    def get_word_replacements(self) -> dict[str, str]:
+        return FRENCH_REPLACEMENTS  # the module-level mapping itself, not a copy
diff --git a/normalization/languages/french/replacements.py b/normalization/languages/french/replacements.py
@@ -1 +1,28 @@
-FRENCH_REPLACEMENTS: dict[str, str] = {}
+FRENCH_REPLACEMENTS: dict[str, str] = {
+    # Abbreviated titles and name prefixes
+    "mme": "madame",
+    "mlle": "mademoiselle",
+    "mr": "monsieur",
+    "st": "saint",
+    "dr": "docteur",
+    "prof": "professeur",
+    "pr": "professeur",
+    # Sports
+    "volley-ball": "volleyball",
+    "basket-ball": "basketball",
+    "water-polo": "waterpolo",
+    "ping-pong": "pingpong",
+    "hand-ball": "handball",
+    # Tech / everyday words
+    "wi-fi": "wifi",
+    "cd-rom": "cdrom",
+    "t-shirt": "tshirt",
+    "chat-bot": "chatbot",
+    "blogue": "blog",
+    "e-mail": "email",
+    "week-end": "weekend",
+    "week-ends": "weekends",
+    "porte-monnaie": "portemonnaie",
+    "porte-feuille": "portefeuille",
+    "extra-terrestre": "extraterrestre",
+}
diff --git a/normalization/languages/french/sentence_replacements.py b/normalization/languages/french/sentence_replacements.py
@@ -0,0 +1,12 @@
+FRENCH_SENTENCE_REPLACEMENTS: dict[str, str] = {  # spaced phrase → single token
+    "super predateur": "superprédateur",
+    "ping pong": "pingpong",
+    "hand ball": "handball",
+    "water polo": "waterpolo",
+    "basket ball": "basketball",
+    "volley ball": "volleyball",
+    "wi fi": "wifi",
+    "cd rom": "cdrom",
+    "t shirt": "tshirt",
+    "pour 100": "pourcent",
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,14 @@ classifiers = [
 ]
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
-dependencies = ["contractions>=0.1.73", "pyyaml>=6.0.3"]
+dependencies = [
+    "contractions>=0.1.73",
+    "pyyaml>=6.0.3",
+    "text2num>=3.0.0",
+]
+
+[tool.setuptools.package-data]
+normalization = ["presets/*.yaml"]
 
 [project.urls]
 Homepage = "https://github.com/gladiaio/normalization"

diff --git a/tests/e2e/files/gladia-3.csv b/tests/e2e/files/gladia-3.csv
@@ -123,4 +123,56 @@ x = 5,x equals 5,en
 ¥1000,1000 yens,en
 ø in Danish,o in danish,en
 €20 or €30,20 euros or 30 euros,en
-my  name is bob,my name is bob,en
+my  name is bob,my name is bob,en
+j'ai dit c'est bien,j ai dit c est bien,fr
+vingt trois pommes,23 pommes,fr
+3 milliards d euros,3000000000 d euros,fr
+euh alors hein bah oui,alors oui,fr
+"12,5 degrés",12 virgule 5 degres,fr
+pour 100 de réduction,pourcent de reduction,fr
+"Hello, world!",hello world,default
+ça va?!,ca va,default
+$100,$100,default
+80 €,80 €,default
+test@example.com,test@example.com,default
++1234567890,+1234567890,default
+one two three,one two three,default
+5:30 pm,5:30 pm,default
+d'accord,d accord,fr
+qu'il vient,qu il vient,fr
+n'est pas,n est pas,fr
+l'ordinateur,l ordinateur,fr
+m'appelle,m appelle,fr
+s'il vous plait,s il vous plait,fr
+t'as vu,t as vu,fr
+cent euros,100 euros,fr
+mille deux cents,1200,fr
+cinquante trois,53,fr
+contact@exemple.fr,contact arobase exemple point fr,fr
+"2 < 5",2 plus petit que 5,fr
+50°C,50 degres celsius,fr
+ca coute €50,ca coute 50 euros,fr
+euh bonjour hein,bonjour,fr
+mme dupont,madame dupont,fr
+mlle dubois,mademoiselle dubois,fr
+dr martin,docteur martin,fr
+prof dupont,professeur dupont,fr
+st jean,saint jean,fr
+ping pong,pingpong,fr
+volley ball,volleyball,fr
+basket ball,basketball,fr
+hand ball,handball,fr
+water polo,waterpolo,fr
+t shirt,tshirt,fr
+cd rom,cdrom,fr
+super predateur,superpredateur,fr
+"3,14 pi",3 virgule 14 pi,fr
+soixante-dix,70,fr
+quatre-vingts,80,fr
+quatre-vingt-un,81,fr
+nonante-neuf,99,fr
+septante et un,71,fr
+x = 5,x egal a 5,fr
+test@example.com,test arobase example point com,fr
+bonjour (euh) ami,bonjour ami,fr
+ça date d'hier,ca date d hier,fr
diff --git a/tests/unit/steps/text/apply_sentence_level_replacements_test.py b/tests/unit/steps/text/apply_sentence_level_replacements_test.py
@@ -0,0 +1,18 @@
+from normalization.languages.french import FrenchOperators
+from normalization.steps.text.apply_sentence_level_replacements import (
+    ApplySentenceLevelReplacementsStep,
+)
+
+from .conftest import assert_text_step_registered
+
+
+def test_step_is_registered():
+    assert_text_step_registered(ApplySentenceLevelReplacementsStep)  # registry check
+
+
+def test_apply_sentence_level_replacements_step_french_pour_100(
+    french_operators: FrenchOperators,
+):
+    step = ApplySentenceLevelReplacementsStep()
+    result = step("pour 100 de réduction", french_operators)
+    assert result == "pourcent de réduction"
diff --git a/tests/unit/steps/text/conftest.py b/tests/unit/steps/text/conftest.py
@@ -2,6 +2,7 @@
 
 from normalization.languages.base import LanguageOperators
 from normalization.languages.english import EnglishOperators
+from normalization.languages.french import FrenchOperators
 from normalization.steps import get_step_registry
 
 
@@ -15,6 +16,11 @@ def english_operators():
     return EnglishOperators()
 
 
+@pytest.fixture
+def french_operators() -> FrenchOperators:
+    return FrenchOperators()
+
+
 def assert_text_step_registered(step_cls):
     """Verify a text step is properly registered under its name."""
     registry = get_step_registry()

diff --git a/tests/unit/steps/text/expand_contractions_test.py b/tests/unit/steps/text/expand_contractions_test.py
@@ -0,0 +1,21 @@
+from normalization.languages.english import EnglishOperators
+from normalization.languages.french import FrenchOperators
+from normalization.steps.text.expand_contractions import ExpandContractionsStep
+
+from .conftest import assert_text_step_registered
+
+
+def test_step_is_registered():
+    assert_text_step_registered(ExpandContractionsStep)  # registry check
+
+
+def test_expand_contractions_step_english(english_operators: EnglishOperators):
+    """The English input "he ain't gonna" is expanded to "he is not going to"."""
+    step = ExpandContractionsStep()
+    assert step("he ain't gonna", english_operators) == "he is not going to"
+
+
+def test_expand_contractions_step_french(french_operators: FrenchOperators):
+    """Standard elisions before a vowel (j'ai, c'est) must come back unchanged."""
+    elided = "j'ai dit c'est bien"
+    assert ExpandContractionsStep()(elided, french_operators) == elided