#!/usr/bin/env python

# file: $PLAGCHECK/src/scraper.py
#

# revision history
#
# 20221226 (JP): reviewed
# 20221222 (PM): reviewed and refactored
# 20220202 (JL): initial version
#
# Purpose: a function that takes a moss.pl link as an argument,
# and displays the info from the plagiarism checker in a user-friendly manner.
#------------------------------------------------------------------------------

# import system modules
#
from bs4 import BeautifulSoup
import os
import re
import requests
import sys

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
# __FILE__ holds only the final path component of this module's path and
# is interpolated into the error message printed by nedc_scraper.
#
__FILE__ = os.path.basename(__file__)

# regex constants
#
# RE_FILE_NAME: matches one character that is not "/", followed by the
# literal text " file: ", and captures the remainder of that line. The
# match is case-insensitive. The doubled slash inside the character class
# is redundant: "[^//]" is equivalent to "[^/]". nedc_scraper applies this
# pattern to the text of a <pre> element to pull out the file name.
#
# RE_GET_NUM: matches a run of one or more characters that are not ASCII
# letters or digits. nedc_scraper removes such runs with re.sub() before
# calling int() on the percentage token. Note that letters are NOT
# removed, so a token that contains letters will still fail int().
#
RE_FILE_NAME = re.compile(r"[^//] file: (.*)", re.IGNORECASE)
RE_GET_NUM = '[^A-Za-z0-9]+'

# HTML elements constants
#
# A, FRAME and PRE are tag names passed to BeautifulSoup's find_all().
# HREF and SRC are attribute names read from those tags.
# BS_HTMLP is the parser name handed to the BeautifulSoup constructor.
#
A = "a"
BS_HTMLP = "html.parser"
FRAME = "frame"
HREF = "href"
PRE = "pre"
SRC = "src"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# function: nedc_scraper
#
# argument:
#   file_link: the overview link given by moss [str]
#   threshold: the threshold to compare the similarity to [int]
#
# return: a boolean value indicating status
#
# this function opens the link, scrapes the information, and then prints it out.
#
def nedc_scraper(file_link, threshold):
    """Print every Moss match whose similarity exceeds a threshold.

    The overview page at file_link is downloaded and its anchor tags are
    read two at a time (one anchor per file of a match). For each match
    the two code frames are downloaded to recover the file names, and the
    match is printed when either file's percentage is strictly greater
    than threshold.

    Arguments:
      file_link: the overview link given by moss [str]
      threshold: the threshold to compare the similarity to [int]

    Returns:
      True when every match was processed; False (after printing an
      error message) when a page does not have the expected structure.

    Exceptions raised by requests (e.g. connection errors) are not caught
    here and propagate to the caller.
    """

    # fetch the overview page from the Moss server and parse it with
    # BeautifulSoup's HTML parser
    #
    overview = requests.get(file_link)
    soup = BeautifulSoup(overview.text, BS_HTMLP)

    # first 6 tags always are the same data, will not ever be used
    #
    allatags = soup.find_all(A)[6:]

    # frame sources are appended to the overview link; strip a trailing
    # slash once so that the joined URL never contains "//"
    #
    base_link = file_link.rstrip("/")

    pair_count = 1

    # anchors come in pairs, so step by two. stopping at len - 1 makes
    # sure that allatags[i + 1] always exists, even when the page holds
    # an odd number of anchors (a trailing unpaired anchor is ignored).
    #
    for i in range(0, len(allatags) - 1, 2):

        # check if there's a link
        #
        if not allatags[i].has_attr(HREF):
            print("Error (%s): allatags does not have a link to the "
                  "result (%s)" % (__FILE__, allatags[i]))
            return False

        # access the individual match page
        #
        link = allatags[i][HREF]
        response = requests.get(link)
        frames = BeautifulSoup(response.text, BS_HTMLP).find_all(FRAME)

        # skip the first frame (it is not one of the code listings) and
        # keep the next two, one per file being compared
        #
        code_frames = frames[1:3]
        if len(code_frames) != 2:
            print("Error (%s): expected two code frames in the result (%s)" %
                  (__FILE__, link))
            return False

        # use a fresh pair for every match so that no data can leak from
        # one match into the next
        #
        pair = (WorkFile(), WorkFile())

        # get each student's filename from the corresponding code frame
        #
        for work, code_frame in zip(pair, code_frames):

            file_name = None
            if code_frame.has_attr(SRC):
                file_name = _scrape_file_name(base_link + "/" +
                                              code_frame[SRC])

            if file_name is None:
                print("Error (%s): unable to find a file name in the "
                      "result (%s)" % (__FILE__, link))
                return False

            work.file_name = file_name
            work.base_name = file_name.split(".")[0]

        # get the directory and similarity percentage of both files
        #
        summaries = (_parse_summary(allatags[i]),
                     _parse_summary(allatags[i + 1]))
        if any(summary is None for summary in summaries):
            print("Error (%s): unable to parse the similarity of the "
                  "result (%s)" % (__FILE__, link))
            return False

        # update the corresponding file class object
        #
        for work, (file_dir, percentage) in zip(pair, summaries):
            work.file_dir = file_dir + work.file_name
            work.percentage = percentage

        # check against the threshold
        #
        if (pair[0].percentage > threshold or
            pair[1].percentage > threshold):

            # pretty printing
            #
            print("==============================")
            print(f"|           Pair {pair_count:02}          | ")
            print("==============================")
            print()
            print(f"File 1: {pair[0].file_dir} ({pair[0].percentage}%)")
            print(f"File 2: {pair[1].file_dir} ({pair[1].percentage}%)")
            print()
            print(f"Link: {link}".rjust(10))
            print()

            pair_count += 1

    print("==============================")
    print("|            End             | ")
    print("==============================")
    print()
    print("Overview Link: %s" % file_link)

    # exit gracefully
    #
    return True

# function: _scrape_file_name
#
# argument:
#   frame_url: the link to one code frame of a match [str]
#
# return: the file name captured by RE_FILE_NAME [str], or None when the
#         page has no <pre> element or the pattern does not match
#
# this function downloads a code frame and extracts the file name from
# its first <pre> element.
#
def _scrape_file_name(frame_url):

    response = requests.get(frame_url)
    code_block = BeautifulSoup(response.text, BS_HTMLP).find_all(PRE)
    if not code_block:
        return None

    # only the first match is used
    #
    match = RE_FILE_NAME.search(str(code_block[0]))
    if match is None:
        return None
    return match.group(1)

# function: _parse_summary
#
# argument:
#   tag: an anchor tag whose text is "<directory> <percentage>"
#
# return: a (directory [str], percentage [int]) tuple, or None when the
#         text cannot be parsed
#
# this function splits the anchor text into the directory and the
# similarity percentage.
#
def _parse_summary(tag):

    # tag.string is None when the anchor has no single text child
    #
    text = (tag.string or "").strip()

    # split on the last run of whitespace only, so that a directory that
    # itself contains spaces is kept intact
    #
    parts = text.rsplit(None, 1)
    if len(parts) != 2:
        return None

    # strip the punctuation around the number so it can be compared to
    # the threshold
    #
    try:
        percentage = int(re.sub(RE_GET_NUM, '', parts[1]))
    except ValueError:
        return None

    return parts[0], percentage

#------------------------------------------------------------------------------
#
# classes are listed here
#
#------------------------------------------------------------------------------

# class: WorkFile
#
# This class keeps track of the student's work information.
#
class WorkFile:

    # method: WorkFile::constructor
    #
    # arguments: none
    #
    # returns: none
    #
    # every new object starts with empty text fields and a percentage of
    # zero; the fields are filled in afterwards by the code that uses it.
    #
    def __init__(self) -> None:

        # name of the file, its name up to the first ".", and its
        # directory joined with the file name
        #
        self.file_name, self.base_name, self.file_dir = "", "", ""

        # similarity percentage reported for this file
        #
        self.percentage = 0

    #
    # end of method

#
# end of WorkFile

#
# end of file