import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

# Load the CSV file into a DataFrame
df = pd.read_csv('bank_statements.csv')
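# Assumption: the CSV contains at least a 'Label' column holding the raw transaction
# descriptions; the exact file layout is not shown in the original, so adjust as needed.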

# Approximate label normalization via WordNet: if the label matches a known word,
# replace it with the canonical lemma of its first (most common) synset;
# otherwise keep the original label unchanged.
def correct_spelling(label):
    suggestions = wordnet.synsets(label)
    if suggestions:
        return suggestions[0].lemmas()[0].name()
    return label

# Apply the correction to each transaction label
df['Label'] = df['Label'].apply(correct_spelling)

# Vectorize the transaction labels into bag-of-words count features
vectorizer = CountVectorizer()
labels = df['Label']
X = vectorizer.fit_transform(labels)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, random_state=42, test_size=0.2)

# Train the multinomial naive Bayes classifier
# (note: the label text serves as both the feature source and the prediction target here)
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification accuracy: {accuracy:.2f}')
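
# Illustrative use on a new, unseen description; the example string below is
# hypothetical and only shows how to reuse the fitted vectorizer and classifier.
example = vectorizer.transform(['PAYMENT CARD 1234 SUPERMARKET'])
print(clf.predict(example))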

# Map each distinct label to a numeric category code (sorted for a deterministic mapping)
categories = sorted(df['Label'].unique())
category_mapping = {cat: i for i, cat in enumerate(categories)}
df['Category'] = df['Label'].map(category_mapping)

# Inspect the first few rows of the categorized data
print(df.head())