/**
 * PDF Service
 * Handles extraction of text content from PDF documents.
 * Uses PDF.js for parsing PDF files in the browser.
 */

// Import PDF.js
import * as pdfjsLib from 'pdfjs-dist';
import { isDevEnvironment } from '@/utils/environment';

// Set the worker source for PDF.js.
// The worker is always fetched over HTTPS: an HTTPS resource loads fine from
// both http: and https: pages, whereas an http: worker script is insecure.
// The URL no longer reads `window.location`, so importing this module does not
// throw in environments where `window` is undefined (e.g. SSR or Node tests).
// The path is pinned to the installed pdfjs-dist version so the worker matches
// the library build.
const workerUrl = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.js`;

pdfjsLib.GlobalWorkerOptions.workerSrc = workerUrl;

/** A single entry in a document outline listing. */
interface PdfOutlineItem {
  /** Display title of the outline entry. */
  title: string;
  /** Nesting level; the summary indents each entry by two spaces per level. */
  level: number;
}

/** Outline (bookmark) information attached to an extraction result. */
interface PdfOutline {
  /** Heading shown for the outline as a whole. */
  title: string;
  /** Outline entries in document order. */
  items: Array<PdfOutlineItem>;
}

/** Text and metadata produced by extracting a PDF document. */
interface PdfExtractionResult {
  /** Extracted text of the processed pages. */
  text: string;
  /** Number of pages reported for the document. */
  pageCount: number;
  /** Document title, when one is available. */
  title?: string;
  /** Keyword candidates gathered during extraction. */
  keywords?: Array<string>;
  /** Whether the document appears to have an explicit structure. */
  structured?: boolean;
  /** Document outline, when one is available. */
  outline?: PdfOutline;
}

/**
 * PDF Service for extracting text and metadata from PDF files.
 *
 * Successful extractions are memoized in memory for 24 hours, keyed by the
 * combination of page limit and URL.
 */
export class PdfService {
  /** Cache expiration time (24 hours), in milliseconds. */
  private readonly CACHE_EXPIRATION = 24 * 60 * 60 * 1000;

  /** Number of leading text lines copied verbatim into a summary. */
  private static readonly SUMMARY_HEAD_LINES = 50;

  /** Number of text lines sampled from the middle of a long extraction. */
  private static readonly SUMMARY_MIDDLE_LINES = 15;

  /** Maximum number of outline entries listed in a summary. */
  private static readonly SUMMARY_OUTLINE_ITEMS = 10;

  /** Lowercase phrases whose presence suggests a product catalog. */
  private static readonly CATALOG_KEYWORDS: readonly string[] = [
    'catalog', 'collection', 'product line', 'specifications',
    'dimensions', 'prices', 'models', 'options', 'features',
  ];

  /**
   * Extraction results keyed by `${maxPages}|${pdfUrl}`.
   * The page limit is part of the key so a truncated extraction is never
   * served to a caller that asked for more pages.
   */
  private readonly extractionCache = new Map<string, {
    result: PdfExtractionResult;
    timestamp: number;
  }>();

  /**
   * Extract text content from a PDF document.
   *
   * @param pdfUrl URL of the PDF file
   * @param maxPages Maximum number of pages to extract (for large PDFs)
   * @returns Extracted text content and metadata. `pageCount` is the total
   *   number of pages in the document, even when fewer pages were processed.
   * @throws Re-throws the underlying load/parse error outside the dev
   *   environment; in the dev environment a placeholder result is returned
   *   instead (and is not cached).
   */
  async extractPdfContent(pdfUrl: string, maxPages: number = 20): Promise<PdfExtractionResult> {
    // Check cache first
    const cacheKey = `${maxPages}|${pdfUrl}`;
    const cached = this.extractionCache.get(cacheKey);
    if (cached && Date.now() - cached.timestamp < this.CACHE_EXPIRATION) {
      console.log('Using cached PDF extraction for:', pdfUrl);
      return cached.result;
    }

    // Declared outside the try block so the finally clause can release it.
    let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

    try {
      console.log('Extracting content from PDF:', pdfUrl);

      // Load the PDF file
      loadingTask = pdfjsLib.getDocument(pdfUrl);
      const pdf = await loadingTask.promise;

      // Metadata is optional: a failure here must not abort the extraction.
      const metadata = await pdf.getMetadata().catch(() => null);
      const title = metadata?.info?.Title || undefined;

      const pageCount = pdf.numPages;
      // Limit the work done for very large documents.
      const pagesToProcess = Math.min(pageCount, maxPages);

      let fullText = '';
      const extractedKeywords: string[] = [];

      for (let pageNumber = 1; pageNumber <= pagesToProcess; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();

        // Keep only items that actually carry a string, so entries without
        // `str` cannot inject `undefined` into the text or keyword list.
        const textItems: any[] = textContent.items
          .filter((item: any) => typeof item?.str === 'string');

        const pageText: string = textItems.map((item) => item.str).join(' ');
        fullText += `\n--- Page ${pageNumber} ---\n${pageText}\n`;

        // Extract keywords from first few pages (likely to contain important terms)
        if (pageNumber <= 3) {
          // Capitalized multi-word phrases, e.g. "Product Line Overview".
          const possibleKeywords = pageText.match(/\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g) || [];
          extractedKeywords.push(...possibleKeywords);

          // Items rendered taller than 12 units are treated as likely headers.
          const largerTextItems: string[] = textItems
            .filter((item) => (item.height || 0) > 12)
            .map((item) => item.str);
          extractedKeywords.push(...largerTextItems);
        }
      }

      // Trim before deduplicating so whitespace-only runs are dropped and
      // "Term" / "Term " collapse into one entry; keep at most 10 candidates.
      const uniqueKeywords = [...new Set(extractedKeywords.map(keyword => keyword.trim()))]
        .filter(keyword => keyword.length > 3)
        .slice(0, 10);

      // Heuristic: a contents/index marker or many keyword candidates.
      const hasStructure = fullText.includes('Table of Contents') ||
                          fullText.includes('Index') ||
                          (uniqueKeywords.length >= 5);

      const result: PdfExtractionResult = {
        text: fullText.trim(),
        pageCount,
        title,
        keywords: uniqueKeywords,
        structured: hasStructure,
      };

      // The outline is best-effort: failures are logged, not propagated.
      try {
        const outline = await pdf.getOutline();
        if (outline && outline.length > 0) {
          result.outline = {
            title: title || 'Document Outline',
            // Only top-level outline entries are listed.
            items: outline.map((item: any) => ({
              title: item.title,
              level: item.dest ? 1 : 2, // Simplified level detection
            })),
          };
        }
      } catch (outlineError) {
        console.error('Error extracting PDF outline:', outlineError);
      }

      // Cache the result
      this.extractionCache.set(cacheKey, { result, timestamp: Date.now() });

      return result;
    } catch (error) {
      console.error('Error extracting PDF content:', error);

      // Return fallback content in dev environment
      if (isDevEnvironment()) {
        return {
          text: '[This is a development environment placeholder for PDF text extraction]\nThe PDF content would appear here, including page text and any extractable structured information.',
          pageCount: 5,
          keywords: ['sample', 'product', 'mattress', 'warranty', 'catalog'],
          structured: true
        };
      }

      // In production, propagate the failure to the caller
      throw error;
    } finally {
      // Release the document and its worker resources whether or not the
      // extraction succeeded; a cleanup failure must not mask the outcome.
      if (loadingTask) {
        try {
          await loadingTask.destroy();
        } catch (destroyError) {
          console.error('Error releasing PDF resources:', destroyError);
        }
      }
    }
  }

  /**
   * Summarizes PDF content for easier usage in the masterPrompt.
   *
   * The summary contains a metadata header (title, total pages, key topics),
   * up to 10 outline entries, and the first 50 lines of extracted text. For
   * documents with more than 5 pages, a 15-line sample from the middle of the
   * text is appended, but only when that sample starts beyond the first 50
   * lines, so content is never repeated.
   *
   * @param pdfUrl URL of the PDF file
   * @returns Summarized content suitable for the masterPrompt. On failure a
   *   bracketed error message is returned instead of throwing.
   */
  async getSummarizedPdfContent(pdfUrl: string): Promise<string> {
    try {
      const extractionResult = await this.extractPdfContent(pdfUrl);
      const { text, pageCount, title, keywords, outline } = extractionResult;

      // Create a header with metadata
      let summary = title ? `Document Title: ${title}\n` : '';
      summary += `Total Pages: ${pageCount}\n`;

      if (keywords && keywords.length > 0) {
        summary += `Key Topics: ${keywords.join(', ')}\n`;
      }

      // Add outline if available
      if (outline && outline.items.length > 0) {
        const outlineLimit = PdfService.SUMMARY_OUTLINE_ITEMS;
        summary += '\nDocument Structure:\n';
        for (const item of outline.items.slice(0, outlineLimit)) {
          summary += `${' '.repeat(item.level * 2)}- ${item.title}\n`;
        }

        if (outline.items.length > outlineLimit) {
          summary += `  (... and ${outline.items.length - outlineLimit} more sections)\n`;
        }
      }

      // Add extracted text, starting with the first lines of the document
      const textLines = text.split('\n');
      const headLineCount = PdfService.SUMMARY_HEAD_LINES;

      summary += '\nExtracted Content (first pages):\n';
      summary += textLines.slice(0, headLineCount).join('\n');

      // For a large document, add a sample from the middle too. Skip it when
      // the midpoint falls inside the head section already emitted above,
      // otherwise the same lines would appear twice.
      const middleStart = Math.floor(textLines.length / 2);
      if (pageCount > 5 && middleStart >= headLineCount) {
        const middleSample = textLines
          .slice(middleStart, middleStart + PdfService.SUMMARY_MIDDLE_LINES)
          .join('\n');

        summary += '\n\n[...]\n\nSample from middle of document:\n';
        summary += middleSample;
      }

      return summary;
    } catch (error) {
      console.error('Error summarizing PDF content:', error);
      return `[Error summarizing PDF content: ${error instanceof Error ? error.message : 'Unknown error'}]`;
    }
  }

  /**
   * Determines if the PDF content appears to be a product catalog.
   *
   * Scoring: one point per catalog phrase found in the text, two points for
   * a product code / model number, two points for pricing information, and
   * one point per extracted keyword that contains a catalog phrase. A total
   * of 3 or more is treated as a catalog.
   *
   * @param extractionResult Result from PDF extraction
   * @returns Boolean indicating if it looks like a catalog
   */
  isCatalog(extractionResult: PdfExtractionResult): boolean {
    const { text, keywords } = extractionResult;
    const lowercaseText = text.toLowerCase();
    const catalogKeywords = PdfService.CATALOG_KEYWORDS;

    // Count catalog phrases present anywhere in the text
    const keywordMatches = catalogKeywords
      .filter(keyword => lowercaseText.includes(keyword))
      .length;

    // Product codes such as "model X-12" or "AB-123"
    const hasProductCodes = /model\s+\w+-\d+/i.test(text) || /\b[A-Z]{2,}-\d+\b/.test(text);

    // Dollar amounts or an explicit price list
    const hasPricing = /\$\d+(\.\d{2})?/.test(text) || lowercaseText.includes('price list');

    let catalogScore = keywordMatches + (hasProductCodes ? 2 : 0) + (hasPricing ? 2 : 0);

    // Boost score if the extracted keywords suggest a catalog
    if (keywords) {
      catalogScore += keywords
        .filter(k => catalogKeywords.some(ck => k.toLowerCase().includes(ck)))
        .length;
    }

    return catalogScore >= 3;
  }
}