Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions normalization/languages/french/number_normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""French number normalizer using text2num's alpha2digit.

Converts spelled-out numbers to digits (e.g. vingt trois → 23) and handles
mixed digit+word forms (e.g. 3 milliards → trois milliards) before conversion
so alpha2digit does not misinterpret them.
"""

import re

try:
from text_to_num import alpha2digit
except ImportError:
alpha2digit = None


# Digit-to-French word mapping for normalizing "3 milliards" → "trois milliards".
_DIGIT_TO_FRENCH: dict[str, str] = {
"0": "zéro",
"1": "un",
"2": "deux",
"3": "trois",
"4": "quatre",
"5": "cinq",
"6": "six",
"7": "sept",
"8": "huit",
"9": "neuf",
}

# Pattern: digit(s) followed by millions/milliards (French) or billions/trillions.
_RE_MIXED_NUMBER = re.compile(
r"\b(\d+)\s+(millions?|milliards?|billions?|trillions?)\b",
re.IGNORECASE,
)


def _normalize_mixed_numbers(text: str) -> str:
"""Convert '3 milliards' → 'trois milliards' so alpha2digit yields 3e9, not 31e9.

alpha2digit may concatenate a lone digit with the following word; converting
the digit to a word avoids that (e.g. 'trois milliards' → 3000000000).
"""

def replace(match: re.Match) -> str:
number = match.group(1)
multiplier = match.group(2)
if len(number) == 1 and number in _DIGIT_TO_FRENCH:
return f"{_DIGIT_TO_FRENCH[number]} {multiplier}"
# Multi-digit: keep as-is; alpha2digit will handle or leave unchanged
return match.group(0)

return _RE_MIXED_NUMBER.sub(replace, text)


class FrenchNumberNormalizer:
    """Callable that turns French spelled-out numbers into digits.

    The work is delegated to text2num's ``alpha2digit``. Before that call, a
    pre-pass rewrites mixed digit+word forms (e.g. ``3 milliards``) via
    ``_normalize_mixed_numbers``.
    """

    def __init__(self) -> None:
        """Bind the converter, failing early when text2num is not installed.

        Raises:
            ImportError: If the optional ``text_to_num`` import did not succeed.
        """
        converter = alpha2digit
        if converter is None:
            raise ImportError(
                "French number normalization requires the text2num package. "
                "Install it with: uv add text2num"
            )
        self._alpha2digit = converter

    def __call__(self, text: str) -> str:
        """Return ``text`` with French number words converted to digits."""
        prepared = _normalize_mixed_numbers(text)
        return self._alpha2digit(prepared, "fr")
102 changes: 101 additions & 1 deletion normalization/languages/french/operators.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,30 @@
import re

from normalization.languages.base import (
LanguageConfig,
LanguageOperators,
)
from normalization.languages.french.number_normalizer import FrenchNumberNormalizer
from normalization.languages.french.replacements import FRENCH_REPLACEMENTS
from normalization.languages.french.sentence_replacements import (
FRENCH_SENTENCE_REPLACEMENTS,
)
from normalization.languages.registry import register_language

# French digit words (0-9) for steps that need digit-word recognition.
_FRENCH_DIGIT_WORDS: dict[str, str] = {
"zéro": "0",
"un": "1",
"deux": "2",
"trois": "3",
"quatre": "4",
"cinq": "5",
"six": "6",
"sept": "7",
"huit": "8",
"neuf": "9",
}

FRENCH_CONFIG = LanguageConfig(
code="fr",
decimal_separator=",",
Expand All @@ -29,10 +50,89 @@
"¥": "yens",
},
filler_words=["euh", "hum", "beh", "bah", "ben", "hein"],
digit_words=_FRENCH_DIGIT_WORDS,
sentence_replacements=FRENCH_SENTENCE_REPLACEMENTS,
number_words=[
"zéro",
"un",
"deux",
"trois",
"quatre",
"cinq",
"six",
"sept",
"huit",
"neuf",
"dix",
"onze",
"douze",
"treize",
"quatorze",
"quinze",
"seize",
"vingt",
"trente",
"quarante",
"cinquante",
"soixante",
"septante",
"octante",
"huitante",
"nonante",
"cent",
"mille",
"million",
"millions",
"milliard",
"milliards",
"billion",
"billions",
"trillion",
"trillions",
],
Comment on lines +55 to +92
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

number_words is missing common standard French forms.

This list is used for number-word detection, but ordinary spellings like dix-sept, dix-huit, dix-neuf, soixante-dix, quatre-vingt, quatre-vingts, and quatre-vingt-dix are absent while rarer regional variants are present. Those phrases will be skipped by config-driven steps.

💡 At minimum, add the standard hyphenated forms
     number_words=[
         "quinze",
         "seize",
+        "dix-sept",
+        "dix-huit",
+        "dix-neuf",
         "vingt",
         "trente",
         "quarante",
         "cinquante",
         "soixante",
+        "soixante-dix",
         "septante",
         "octante",
         "huitante",
         "nonante",
+        "quatre-vingt",
+        "quatre-vingts",
+        "quatre-vingt-dix",
         "cent",
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@normalization/languages/french/operators.py` around lines 55 - 92, The
number_words list is missing common hyphenated French numerals; update the
number_words variable to include standard hyphenated forms such as "dix-sept",
"dix-huit", "dix-neuf", "soixante-dix", "quatre-vingt", "quatre-vingts", and
"quatre-vingt-dix" so the detection covers ordinary French spellings (modify the
number_words array in the French operators module).

plus_word="plus",
)


@register_language
class FrenchOperators(LanguageOperators):
    """French language operators: contractions, written numbers, word replacements."""

    def __init__(self) -> None:
        """Initialise with the French config and build the number normalizer."""
        super().__init__(FRENCH_CONFIG)
        self._number_normalizer = FrenchNumberNormalizer()

    def expand_contractions(self, text: str) -> str:
        """Expand informal spoken French contractions, before consonants only.

        Elision before a vowel or ``h`` is the standard written form and is
        preserved: j'ai, c'est, l'ami, d'accord, l'œil stay as they are.
        Only an apostrophe that is NOT followed by one of those letters is
        treated as an informal spoken reduction and expanded
        (j'veux → je veux, j'suis → je suis).

        Args:
            text: Text that may contain apostrophe contractions.

        Returns:
            The text with each informal contraction replaced by its full
            clitic followed by a space. Matching is case-insensitive and the
            inserted clitic is always lower-case.
        """
        # Letters before which elision is standard written French. The
        # ligatures œ/æ are included so that forms such as "l'œil" or
        # "chef-d'œuvre" are not mistaken for informal reductions.
        elision_initials = "aàâæeéèêiîïoôœuùûyh"
        not_elision = rf"(?![{elision_initials}{elision_initials.upper()}])"

        # Elided clitic → full form, applied in this fixed order.
        expansions = {
            "j": "je",
            "c": "ce",
            "d": "de",
            "qu": "que",
            "n": "ne",
            "s": "se",
            "m": "me",
            "t": "te",
            "l": "le",
        }
        for clitic, full_form in expansions.items():
            text = re.sub(
                rf"\b{clitic}'{not_elision}",
                f"{full_form} ",
                text,
                flags=re.IGNORECASE,
            )
        return text

    def expand_written_numbers(self, text: str) -> str:
        """Convert French spelled-out numbers to digits (vingt trois → 23).

        Delegates to the FrenchNumberNormalizer instance created in
        ``__init__``.
        """
        return self._number_normalizer(text)

    def get_word_replacements(self) -> dict[str, str]:
        """Return the French word-level replacement table."""
        return FRENCH_REPLACEMENTS
29 changes: 28 additions & 1 deletion normalization/languages/french/replacements.py
Original file line number Diff line number Diff line change
@@ -1 +1,28 @@
# Word-level replacements: each key is rewritten to its value.
FRENCH_REPLACEMENTS: dict[str, str] = {
    # Abbreviated titles / prefixes
    "mme": "madame",
    "mlle": "mademoiselle",
    "mr": "monsieur",
    "st": "saint",
    "dr": "docteur",
    "prof": "professeur",
    "pr": "professeur",
    # Sports
    "volley-ball": "volleyball",
    "basket-ball": "basketball",
    "water-polo": "waterpolo",
    "ping-pong": "pingpong",
    "hand-ball": "handball",
    # Tech / everyday
    "wi-fi": "wifi",
    "cd-rom": "cdrom",
    "t-shirt": "tshirt",
    "chat-bot": "chatbot",
    "blogue": "blog",
    "e-mail": "email",
    "week-end": "weekend",
    "week-ends": "weekends",
    "porte-monnaie": "portemonnaie",
    "porte-feuille": "portefeuille",
    "extra-terrestre": "extraterrestre",
}
12 changes: 12 additions & 0 deletions normalization/languages/french/sentence_replacements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Multi-word (space-separated) phrases mapped to their single-token spelling.
# Passed to the French LanguageConfig as `sentence_replacements`.
FRENCH_SENTENCE_REPLACEMENTS: dict[str, str] = {
    # NOTE(review): the key is unaccented while the value is accented; an input
    # spelled "super prédateur" would presumably not match — confirm against
    # the order of the accent-stripping step.
    "super predateur": "superprédateur",
    # Sports names written as two words.
    "ping pong": "pingpong",
    "hand ball": "handball",
    "water polo": "waterpolo",
    "basket ball": "basketball",
    "volley ball": "volleyball",
    # Tech / everyday terms written as two words.
    "wi fi": "wifi",
    "cd rom": "cdrom",
    "t shirt": "tshirt",
    # "pour 100" is the spelled form of "%" (pourcent).
    "pour 100": "pourcent",
}
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@ classifiers = [
]
license = { file = "LICENSE" }
requires-python = ">=3.10"
dependencies = ["contractions>=0.1.73", "pyyaml>=6.0.3"]
dependencies = [
"contractions>=0.1.73",
"pyyaml>=6.0.3",
"text2num>=3.0.0",
]

[tool.setuptools.package-data]
normalization = ["presets/*.yaml"]

[project.urls]
Homepage = "https://github.com/gladiaio/normalization"
Expand Down
54 changes: 53 additions & 1 deletion tests/e2e/files/gladia-3.csv
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,56 @@ x = 5,x equals 5,en
¥1000,1000 yens,en
ø in Danish,o in danish,en
€20 or €30,20 euros or 30 euros,en
my name is bob,my name is bob,en
my name is bob,my name is bob,en
j'ai dit c'est bien,j ai dit c est bien,fr
vingt trois pommes,23 pommes,fr
3 milliards d euros,3000000000 d euros,fr
euh alors hein bah oui,alors oui,fr
"12,5 degrés",12 virgule 5 degres,fr
pour 100 de réduction,pourcent de reduction,fr
"Hello, world!",hello world,default
ça va?!,ca va,default
$100,$100,default
80 €,80 €,default
test@example.com,test@example.com,default
+1234567890,+1234567890,default
one two three,one two three,default
5:30 pm,5:30 pm,default
d'accord,d accord,fr
qu'il vient,qu il vient,fr
n'est pas,n est pas,fr
l'ordinateur,l ordinateur,fr
m'appelle,m appelle,fr
s'il vous plait,s il vous plait,fr
t'as vu,t as vu,fr
cent euros,100 euros,fr
mille deux cents,1200,fr
cinquante trois,53,fr
contact@exemple.fr,contact arobase exemple point fr,fr
"2 < 5",2 plus petit que 5,fr
50°C,50 degres celsius,fr
ca coute €50,ca coute 50 euros,fr
euh bonjour hein,bonjour,fr
mme dupont,madame dupont,fr
mlle dubois,mademoiselle dubois,fr
dr martin,docteur martin,fr
prof dupont,professeur dupont,fr
st jean,saint jean,fr
ping pong,pingpong,fr
volley ball,volleyball,fr
basket ball,basketball,fr
hand ball,handball,fr
water polo,waterpolo,fr
t shirt,tshirt,fr
cd rom,cdrom,fr
super predateur,superpredateur,fr
"3,14 pi",3 virgule 14 pi,fr
soixante-dix,70,fr
quatre-vingts,80,fr
quatre-vingt-un,81,fr
nonante-neuf,99,fr
septante et un,71,fr
x = 5,x egal a 5,fr
test@example.com,test arobase example point com,fr
bonjour (euh) ami,bonjour ami,fr
ça date d'hier,ca date d hier,fr
18 changes: 18 additions & 0 deletions tests/unit/steps/text/apply_sentence_level_replacements_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from normalization.languages.french import FrenchOperators
from normalization.steps.text.apply_sentence_level_replacements import (
ApplySentenceLevelReplacementsStep,
)

from .conftest import assert_text_step_registered


def test_step_is_registered():
    """The sentence-level replacement step is registered under its name."""
    step_cls = ApplySentenceLevelReplacementsStep
    assert_text_step_registered(step_cls)


def test_apply_sentence_level_replacements_step_french_pour_100(
    french_operators: FrenchOperators,
):
    """'pour 100' is collapsed to 'pourcent'; the rest of the text is untouched."""
    step = ApplySentenceLevelReplacementsStep()
    result = step("pour 100 de réduction", french_operators)
    assert result == "pourcent de réduction"
6 changes: 6 additions & 0 deletions tests/unit/steps/text/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from normalization.languages.base import LanguageOperators
from normalization.languages.english import EnglishOperators
from normalization.languages.french import FrenchOperators
from normalization.steps import get_step_registry


Expand All @@ -15,6 +16,11 @@ def english_operators():
return EnglishOperators()


@pytest.fixture
def french_operators():
    """Provide a freshly constructed FrenchOperators instance to a test."""
    operators = FrenchOperators()
    return operators


def assert_text_step_registered(step_cls):
"""Verify a text step is properly registered under its name."""
registry = get_step_registry()
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/steps/text/expand_contractions_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from normalization.languages.english import EnglishOperators
from normalization.languages.french import FrenchOperators
from normalization.steps.text.expand_contractions import ExpandContractionsStep

from .conftest import assert_text_step_registered


def test_step_is_registered():
    """The contraction-expansion step is registered under its name."""
    step_cls = ExpandContractionsStep
    assert_text_step_registered(step_cls)


def test_expand_contractions_step_english(english_operators: EnglishOperators):
    """English contractions and informal forms are expanded to full words."""
    step = ExpandContractionsStep()
    result = step("he ain't gonna", english_operators)
    assert result == "he is not going to"


def test_expand_contractions_step_french(french_operators: FrenchOperators):
    """Standard French elisions before a vowel are left unchanged."""
    step = ExpandContractionsStep()
    result = step("j'ai dit c'est bien", french_operators)
    assert result == "j'ai dit c'est bien"
Loading
Loading