import pandas as pd
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt


# Text preprocessing (tokenization, stop-word removal, and stemming)

_TEXT_TOOLS = None


def _get_text_tools():
    """Build the Sastrawi stop-word remover and stemmer once and reuse them."""
    global _TEXT_TOOLS
    if _TEXT_TOOLS is None:
        _TEXT_TOOLS = (
            StopWordRemoverFactory().create_stop_word_remover(),
            StemmerFactory().create_stemmer(),
        )
    return _TEXT_TOOLS


def preprocess_text(text):
    """Tokenize, filter, and stem a single piece of text.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphabetic are discarded, stop words are removed, and every remaining
    token is stemmed.

    Args:
        text: The raw text to clean. Any non-string value (for example a
            NaN produced by pandas for a missing cell) is treated as an
            empty document.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        nothing survives.
    """
    if not isinstance(text, str):
        return ''

    # Creating the factories is expensive, so the tools are built only once
    # instead of once per call.
    stopword_remover, stemmer = _get_text_tools()

    stems = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        # Lower-case before stop-word matching so capitalised stop words are
        # caught too (assumes the Sastrawi stop-word list is lower-case --
        # TODO confirm).
        kept = stopword_remover.remove(token.lower())
        if not kept:
            # The token was a stop word; skip it instead of keeping an
            # empty string that would leave stray spaces in the result.
            continue
        stem = stemmer.stem(kept)
        if stem:
            stems.append(stem)
    return ' '.join(stems)

# Sentiment analysis using a polarity lexicon

_LEXICON_CACHE = {}


def _load_lexicon(path):
    """Return the set of words listed one per line in *path*, read only once."""
    if path not in _LEXICON_CACHE:
        # The context manager closes the file even if reading fails.
        with open(path, "r", encoding="utf-8") as handle:
            entries = (line.strip() for line in handle.read().splitlines())
            _LEXICON_CACHE[path] = {entry for entry in entries if entry}
    return _LEXICON_CACHE[path]


def analyze_sentiment(text):
    """Classify *text* by counting lexicon hits.

    Tokens produced by ``word_tokenize`` are looked up in the word lists
    stored in ``positive_words.txt`` and ``negative_words.txt`` (one word
    per line, resolved relative to the current working directory).

    Args:
        text: The text to classify.

    Returns:
        ``'positif'`` when positive hits outnumber negative hits,
        ``'negatif'`` when negative hits outnumber positive hits, and
        ``'netral'`` when the counts are equal.

    Raises:
        OSError: If a lexicon file cannot be opened.
    """
    # Lexicons are cached after the first read so that applying this
    # function to many texts does not re-read both files every time.
    positive_words = _load_lexicon("positive_words.txt")
    negative_words = _load_lexicon("negative_words.txt")

    tokens = word_tokenize(text)
    positive_count = sum(1 for word in tokens if word in positive_words)
    negative_count = sum(1 for word in tokens if word in negative_words)

    if positive_count > negative_count:
        return 'positif'
    if positive_count < negative_count:
        return 'negatif'
    return 'netral'
    
# Topic modeling using Latent Dirichlet Allocation (LDA)

def perform_topic_modeling(data, n_components=68, n_top_words=10):
    """Fit an LDA model on *data* and return the top words of every topic.

    The documents are converted to a term-count matrix with
    ``CountVectorizer(max_df=0.95, min_df=2)`` and an online-learning
    ``LatentDirichletAllocation`` model (``random_state=42``) is fitted
    on that matrix.

    Args:
        data: Iterable of text documents.
        n_components: Number of topics to fit. Defaults to 68, the value
            that was previously hard-coded.
        n_top_words: Number of highest-weighted words to return per topic.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        A list with one entry per topic; each entry is a list of that
        topic's words ordered from highest to lowest weight.
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=None)
    tf = vectorizer.fit_transform(data)

    lda_model = LatentDirichletAllocation(
        n_components=n_components,
        learning_method='online',
        random_state=42,
        n_jobs=-1,
    )
    lda_model.fit(tf)

    features = vectorizer.get_feature_names_out()

    # argsort() is ascending, so the reversed tail slice yields the indices
    # of the n_top_words largest weights, largest first.
    return [
        [features[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in lda_model.components_
    ]

def plot_topic_modeling(topics, sentiment):
    """Display a horizontal bar chart built from *topics*.

    Args:
        topics: Object exposing ``keys()`` and ``values()``; the keys are
            used as bar labels and the values as bar lengths.
        sentiment: String that is capitalised and appended to the chart
            title.
    """
    plt.figure(figsize=(10, 6))

    bar_labels = list(topics.keys())
    bar_lengths = list(topics.values())
    plt.barh(bar_labels, bar_lengths, color='skyblue')

    plt.xlabel('Jumlah Kata')
    plt.ylabel('Kata')

    chart_title = f'Diagram Topic Modeling {sentiment.capitalize()}'
    plt.title(chart_title)

    plt.show()

# MODEL EVALUATION
# Read the labeled review data from the CSV file.
df = pd.read_csv('labeled_15persen.csv')

# Preprocess the raw text of the 'Ulasan' column into a new column.
df['Preprocessed_Ulasan'] = df['Ulasan'].apply(preprocess_text)

# Build the term-count matrix and fit an LDA model on it.
# NOTE(review): lda_model is fitted here but is not used again in the visible
# code; the classifier below is trained on the raw term counts (tf), not on
# LDA topic features -- confirm whether that is intended.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=None)
tf = vectorizer.fit_transform(df['Preprocessed_Ulasan'])
lda_model = LatentDirichletAllocation(n_components=68, learning_method='online', random_state=42, n_jobs=-1)
lda_model.fit(tf)

# Train the classification model: features are the term counts, labels come
# from the 'Sentimen' column; 20% of the rows are held out for testing.
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# vocabulary is derived from the test rows as well.
X = tf
y = df['Sentimen']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = classifier.predict(X_test)

# Compute the evaluation metrics (precision, recall and F1 are macro-averaged).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Report the results.
print("Hasil Split data 15%")
print("Akurasi:", accuracy)
print("Presisi:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


