import bs4
import shutil
import json
import requests
from langchain import hub
#from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
#from langchain_community.document_loaders import WebBaseLoader
#from langchain_community.vectorstores import Chroma
#from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.globals import set_verbose
import os
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
#import requests
#from docx import Document
#from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from flask import Flask, request, jsonify
from langchain_community.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.docstore.document import Document
#from langchain_community.llms import OpenAI
#from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate

# Directory that holds this module, as an absolute path with symlinks resolved.
CURR_DIR = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__))
)
# Directory one level above CURR_DIR.
PARENT_DIR = os.path.dirname(CURR_DIR)

# CURR_DIR with a trailing slash, used as a prefix for string-built paths.
APP_DIR = f"{CURR_DIR}/"

# Chunk size and overlap handed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Flask application object; the @app.route decorators in this module register
# their handlers on it.
app = Flask(__name__)

# Maps a lowercase file extension (including the leading dot) to a
# (loader class, constructor keyword arguments) pair.
# load_single_document looks extensions up here, and load_documents globs the
# source directory once per key, so only the extensions listed here are ingested.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    # Text files are decoded explicitly as UTF-8.
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}

def load_single_document(file_path: str) -> List[Document]:
    """Load one file with the loader registered for its extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the loader's ``load()`` call.

    Raises:
        ValueError: If the file's extension has no entry in LOADER_MAPPING
            (including files that have no extension at all).
    """
    # Look only at the final path component: a dot in a directory name, or an
    # extension-less file that happens to be called e.g. "csv", must not be
    # mistaken for a supported extension.
    file_name = os.path.basename(file_path)
    _, dot, suffix = file_name.rpartition(".")
    ext = f".{suffix.lower()}" if dot else ""

    try:
        loader_class, loader_args = LOADER_MAPPING[ext]
    except KeyError:
        raise ValueError(f"Unsupported file extension '{ext}'") from None

    return loader_class(file_path, **loader_args).load()

def load_documents(source_dir: str, ignored_files: "List[str] | None" = None) -> List[Document]:
    """Load every supported file found recursively under ``source_dir``.

    Files are discovered by globbing for each extension in LOADER_MAPPING, in
    both its lower-case and upper-case spelling, and are loaded in parallel by
    a process pool running ``load_single_document``.

    Args:
        source_dir: Directory to search recursively.
        ignored_files: Paths (exactly as returned by ``glob``) to skip.
            ``None`` means nothing is skipped.

    Returns:
        All documents from all loaded files. The order across files is not
        defined because results are collected with ``imap_unordered``.
    """
    ignored = set(ignored_files or ())

    # A dict is used as an ordered set: where pattern matching is
    # case-insensitive, the lower- and upper-case patterns return the same
    # path, and loading it twice would duplicate its documents.
    discovered = {}
    for ext in LOADER_MAPPING:
        for ext_variant in (ext.lower(), ext.upper()):
            pattern = os.path.join(source_dir, f"**/*{ext_variant}")
            for file_path in glob.glob(pattern, recursive=True):
                discovered[file_path] = None

    filtered_files = [file_path for file_path in discovered if file_path not in ignored]
    if not filtered_files:
        # Nothing to load, so do not pay for spawning worker processes.
        return []

    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results

def process_documents(source_directory, ignored_files: "List[str] | None" = None) -> List[Document]:
    """Load the documents under ``source_directory`` and split them into chunks.

    Args:
        source_directory: Directory passed to ``load_documents``.
        ignored_files: Paths to skip while loading. ``None`` means nothing is
            skipped.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using
        CHUNK_SIZE and CHUNK_OVERLAP.

    Raises:
        SystemExit: With exit code 0 when no documents were loaded.
    """
    print(f"Loading documents from {source_directory}")
    print(f"Loading documents into {APP_DIR}")
    # Always hand over a real list so the call does not depend on how
    # load_documents treats a missing value.
    documents = load_documents(source_directory, list(ignored_files or []))
    if not documents:
        print("No new documents to load")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # site module for interactive use and is not guaranteed to exist.
        raise SystemExit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    documents = text_splitter.split_documents(documents)
    # No custom length function is passed to the splitter, so the limit is
    # measured in characters, not tokens.
    print(f"Split into {len(documents)} chunks of text (max. {CHUNK_SIZE} characters each)")
    return documents

def download_document(url: str, download_dir: str, timeout: float = 60.0) -> str:
    """Download ``url`` into ``download_dir`` and return the local file path.

    The local file name is the last segment of the URL's *path* component,
    percent-decoded. Query strings and fragments are ignored so that a URL
    such as ``https://host/a/report.pdf?token=x`` is saved as ``report.pdf``
    and keeps a recognisable extension.

    Args:
        url: Address of the document to fetch.
        download_dir: Existing directory in which the file is written.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        Path of the written file inside ``download_dir``.

    Raises:
        ValueError: If no usable file name can be derived from ``url``.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import unquote, urlsplit

    url_path = unquote(urlsplit(url).path)
    # Treat backslashes as separators too, so a decoded name can never carry a
    # directory component into os.path.join below.
    filename = url_path.replace("\\", "/").rsplit("/", 1)[-1]
    if filename in ("", ".", ".."):
        raise ValueError(f"Cannot derive a file name from URL '{url}'")

    local_filename = os.path.join(download_dir, filename)
    # NOTE(review): two URLs with the same final path segment overwrite each
    # other in download_dir.
    # Without a timeout a stalled server would block the caller indefinitely.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

@app.route('/', methods=['GET'])
def checkapi():
    """Health-check endpoint that reports the app is up.

    Returns:
        A JSON response with ``message`` and ``status`` keys, or a JSON body
        with an ``error`` key if building the response raises.
    """
    try:
        # NOTE(review): the port in the message is static text, not read from
        # the running server.
        return jsonify({'message': 'App is running on port - 5000.', 'status': 'success'})
    except Exception as e:
        return jsonify({'error': str(e)})

def _resolve_chroma_dir(folder_name):
    """Return the absolute path of ``folder_name`` inside the chromadb root.

    Raises:
        ValueError: If ``folder_name`` is not a non-empty string, or if it
            resolves to the chromadb root itself or to a location outside it.
    """
    if not isinstance(folder_name, str) or not folder_name.strip():
        raise ValueError("Folder name must be a non-empty string")
    chroma_root = os.path.realpath(os.path.join(APP_DIR, "chromadb"))
    # Plain concatenation keeps the original "root + name" semantics (a leading
    # slash in the name stays inside the root); realpath then collapses any
    # ".." segments so the containment check below is reliable.
    target = os.path.realpath(chroma_root + os.sep + folder_name)
    if target == chroma_root or os.path.commonpath([chroma_root, target]) != chroma_root:
        raise ValueError(f"Invalid folder name '{folder_name}'")
    return target


@app.route('/save_embeddings', methods=['POST'])
def save_embeddings_to_db():
    """Download documents, embed them and persist them in a Chroma store.

    Expects a JSON body with the keys ``doc_urls`` (a list of URLs, or a
    single URL string), ``api_key`` (OpenAI key), ``delete_folder`` (store
    folder to remove first; an empty value removes nothing) and ``folder``
    (store folder to write to). Both folder names are resolved relative to
    ``<APP_DIR>/chromadb`` and rejected if they point outside it.

    Returns:
        A JSON response with ``message`` and ``status`` keys on success, or a
        JSON body with an ``error`` key if anything fails.
    """
    import tempfile

    download_dir = None
    try:
        data = request.get_json()
        doc_urls = data['doc_urls']
        api_key = data['api_key']
        delete_folder = data['delete_folder']
        folder_name = data['folder']

        # A bare string would otherwise be iterated character by character.
        if isinstance(doc_urls, str):
            doc_urls = [doc_urls]

        # Both names come from the request body and one of them is handed to
        # shutil.rmtree, so confine them to the chromadb root before touching
        # the filesystem. An empty delete_folder would resolve to the root
        # itself (every stored folder), so it is treated as "nothing to delete".
        persist_dir = _resolve_chroma_dir(folder_name)
        emb_folder = _resolve_chroma_dir(delete_folder) if delete_folder else None

        # Use a private directory per request: a shared one would mix in files
        # left behind by failed or concurrent requests.
        downloads_root = os.path.join(APP_DIR, "downloads")
        os.makedirs(downloads_root, exist_ok=True)
        download_dir = tempfile.mkdtemp(dir=downloads_root)

        print(APP_DIR)
        if emb_folder and os.path.exists(emb_folder):
            shutil.rmtree(emb_folder)
            print("Directory '{}' deleted successfully.".format(emb_folder))

        # Download documents
        downloaded_files = []
        for url in doc_urls:
            print(url)
            downloaded_files.append(download_document(url, download_dir))

        # Process documents
        documents = load_documents(download_dir, ignored_files=[])
        if not documents:
            print("No new documents to load")
            # Answer the request instead of calling exit(): SystemExit is not
            # an Exception subclass, so it would skip the handler below and
            # leave the client without a response.
            return jsonify({'error': 'No new documents to load'})
        print(f"Loaded {len(downloaded_files)} new documents from URLs")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        documents = text_splitter.split_documents(documents)
        # No custom length function is passed, so the limit is in characters.
        print(f"Split into {len(documents)} chunks of text (max. {CHUNK_SIZE} characters each)")

        # Create embeddings and save them
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=api_key)
        vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=persist_dir)
        vectordb.persist()

        return jsonify({'message': 'Embeddings processed and saved', 'status': 'success'})
    except Exception as e:
        # NOTE(review): failures are reported in the body with the default
        # status code; kept that way so existing clients keep working.
        return jsonify({'error': str(e)})
    finally:
        # Clean up downloaded files on success and on failure alike.
        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)
    
@app.route('/get_responses', methods=['POST'])
def get_responses():
    """Answer a user query with retrieval-augmented generation.

    Reads a JSON body with the keys ``query``, ``api_key`` and ``folder``
    (Chroma store folder under ``<APP_DIR>/chromadb``), and optionally
    ``context`` (earlier turns, each a mapping with ``question`` and
    ``answer``) and ``instructions`` (extra system-prompt lines, given as a
    list or as a JSON-encoded string of a list).

    Returns:
        The model's answer text on success, or a JSON body with an ``error``
        key if anything fails.
    """
    try:
        set_verbose(True)
        data = request.get_json()
        question = data.get('query', '')
        api_key = data.get('api_key', '')
        # "context": null in the body must behave like an absent key.
        context = data.get('context') or []
        folder = data.get('folder')

        # Accept both a JSON-encoded string and an already-parsed list.
        raw_instructions = data.get('instructions', '[]')
        if isinstance(raw_instructions, str):
            raw_instructions = json.loads(raw_instructions) if raw_instructions.strip() else []
        if raw_instructions is None:
            instructions = []
        elif isinstance(raw_instructions, str):
            instructions = [raw_instructions]
        else:
            instructions = list(raw_instructions)

        # The folder name comes from the request body; keep it inside the
        # chromadb root so it cannot address arbitrary directories.
        if not isinstance(folder, str) or not folder.strip():
            raise ValueError("'folder' must be a non-empty string")
        chroma_root = os.path.realpath(os.path.join(APP_DIR, "chromadb"))
        persist_dir = os.path.realpath(chroma_root + os.sep + folder)
        if persist_dir == chroma_root or os.path.commonpath([chroma_root, persist_dir]) != chroma_root:
            raise ValueError(f"Invalid folder name '{folder}'")

        llm = ChatOpenAI(model="gpt-4-turbo", temperature=1.0, openai_api_key=api_key)
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=api_key)
        vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
        retriever = vectorstore.as_retriever()

        # The reformulated question is used only as the retriever query (see
        # rag_chain below), so it must stay plain text: asking for HTML here
        # would put markup into the similarity search.
        contextualize_q_system_prompt = (
            "Given a chat history and the latest user question "
            "which might reference context in the chat history, formulate a standalone question "
            "which can be understood without the chat history. Do NOT answer the question, "
            "just reformulate it if needed and otherwise return it as is."
        )
        contextualize_q_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", contextualize_q_system_prompt),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{question}"),
            ]
        )
        contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

        prompt_lines = [
            "You are an assistant for question-answering tasks.",
            "Kindly greet if user greets you.",
            "Use the following pieces of retrieved context to answer the question.",
            "Kindly present the response in HTML tags format.",
            "Use <br> as line seperator and eliminate '\\n'.",
            "Always Show Image Reference if given to response using <img> tag if given in the document.",
            "If asked for points or steps or details of anything, give answer in html <ol> or <ul> tags.",
        ]
        # Caller-supplied text becomes part of a prompt *template*: literal
        # braces must be doubled or they are parsed as template variables and
        # the request fails with a missing-variable error.
        prompt_lines.extend(
            str(instruction).replace("{", "{{").replace("}", "}}")
            for instruction in instructions
        )
        # The retrieved documents are substituted here.
        prompt_lines.append("{context}")
        qa_system_prompt = "".join(f"\n{line}" for line in prompt_lines)

        print(qa_system_prompt)

        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", qa_system_prompt),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{question}"),
            ]
        )

        def contextualized_question(input: dict):
            # With history, rewrite the question into a standalone one first;
            # without history the question is already standalone.
            if input.get("chat_history"):
                return contextualize_q_chain
            else:
                return input["question"]

        rag_chain = (
            RunnablePassthrough.assign(
                context=contextualized_question | retriever
            )
            | qa_prompt
            | llm
        )

        chat_history = []
        for item in context:
            chat_history.extend([HumanMessage(content=item.get('question')), AIMessage(content=item.get('answer'))])

        ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
        print(ai_msg)
        return str(ai_msg.content)

    except Exception as e:
        # NOTE(review): failures are reported in the body with the default
        # status code; kept that way so existing clients keep working.
        return jsonify({'error': str(e)})

if __name__ == '__main__':
    # Debug mode is on by default for local development, as before. Set
    # FLASK_DEBUG to 0/false/no/off to turn it off without editing the code;
    # debug mode must not be enabled on a server reachable by untrusted
    # clients.
    debug_enabled = os.environ.get("FLASK_DEBUG", "1").strip().lower() not in ("0", "false", "no", "off")
    app.run(debug=debug_enabled)