#!/usr/bin/env python

# file: $PLAGCHECK/src/plagcheck.py
#

# revision history
#
# 20221226 (JP): reviewed
# 20221222 (PM): reviewed and refactored
# 20220202 (JL): initial version
#
# Purpose: Python script to recursively run through directories
# and search for files that match the specified language, to be
# submitted to the Moss plagiarism checker written by Stanford University.
#------------------------------------------------------------------------------

# import system modules
#
import argparse
import os
import subprocess
import sys

# import specialized modules:
#  urllib: a package that collects several modules for working with URLs
#
from urllib.request import urlopen
from urllib.error import *

# import NEDC modules
#  scraper: a wrapper package that integrates several modules that
#           facilitate working with URLs
#
import scraper as scr

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename:
#  this name is used as a prefix in the error messages
#
__FILE__ = os.path.basename(__file__)

# define the program options:
#  note that you cannot separate them by spaces
#
ARG_HELP = "--help"
ARG_ABRV_HELP = "-h"

ARG_DIR = "--directory"
ARG_ABRV_DIR = "-d"

ARG_LANGUAGE = "--language"
ARG_ABRV_LANGUAGE = "-l"

ARG_THRESHOLD = "--threshold"
ARG_ABRV_THRESHOLD = "-t"

# define the minimum number of command line arguments (including the
# program name) and the default similarity threshold
#
MIN_NARGS = 3
DEF_SIMILARITY_THRESHOLD = 75

# define the languages supported and the associated file extensions:
#  every extension set must be a list (not a bare string) because the
#  file search converts it with tuple(); tuple(".py") would yield
#  ('.', 'p', 'y') and match any filename ending in one of those characters
#
DEF_LANG_C = "c"
DEF_LANG_C_EXTS = [".cc", ".c", ".cpp"]
DEF_LANG_PY = "python"
DEF_LANG_PY_EXTS = [".py"]
DEF_LANG_ML = "matlab"
DEF_LANG_ML_EXTS = [".m", ".mat"]

# define default values for the language and directory options
#
DEF_LANGUAGE = DEF_LANG_PY
DEF_DIRECTORY = "./"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# function: nedc_web_checker
#
# arguments:
#  link: a link (URL string) to be checked
#
# return: a boolean value indicating status (True if the link could be
#         opened, False otherwise)
#
# This function attempts to open the link. If the link cannot be opened
# (HTTP error, unreachable site or malformed link), it displays an error
# message and returns False.
#
def nedc_web_checker(link):

    # try opening the url:
    #  the response is only needed to prove the link is reachable, so it
    #  is closed immediately to avoid leaking the connection
    #
    try:
        with urlopen(link):
            pass

    # exit with error checking:
    #  HTTPError is a subclass of URLError, so it must be handled first
    #
    except HTTPError as e:
        print("Error (%s): HTTP error (%s)" % (__FILE__, e))
        return False
    except URLError as e:
        print("Error (%s): could not find website (%s)" % (__FILE__, e))
        return False

    # urlopen raises ValueError for a malformed link (e.g. an empty string
    # or a missing scheme); report it instead of crashing
    #
    except ValueError as e:
        print("Error (%s): invalid link (%s)" % (__FILE__, e))
        return False

    # exit gracefully
    #
    return True
#
# end of function

# function: _find_code_files
#
# arguments:
#  directory: the root of the directory tree to be searched
#  extensions: a file extension (string) or a collection of file extensions
#
# return: a list of paths to the files whose names end with an extension
#
# This helper walks the directory tree and collects every file with a
# matching extension. Editor placeholder files (names starting with ".#")
# are skipped.
#
def _find_code_files(directory, extensions):

    # a bare string must be wrapped rather than converted: tuple(".py")
    # yields ('.', 'p', 'y'), which matches far too many filenames
    #
    if isinstance(extensions, str):
        suffixes = (extensions,)
    else:
        suffixes = tuple(extensions)

    # find all the files by traversing the specified directory
    #
    code_files = []
    for root, _dirs, files in os.walk(directory):
        for fname in files:

            # when someone is currently working on a program, there is a
            # placeholder that always starts with ".#" which creates
            # a problem for the perl script: skip it, but keep processing
            # the remaining files in this directory
            #
            if fname.startswith('.#'):
                continue

            if fname.endswith(suffixes):
                code_files.append(os.path.join(root, fname))

    # exit gracefully
    #
    return code_files
#
# end of function

# function: main
#
# arguments:
#  argv: the command line arguments, including the program name
#
# return: True on success (the program exits with os.EX_SOFTWARE on error
#         and after displaying the usage or help message)
#
# This function parses the command line, collects the source files for the
# requested language, submits them to Moss using the perl script, checks
# the link that is returned and passes it to the scraper.
#
def main(argv):

    # check the number of arguments and display usage file if necessary
    #
    if len(argv) < MIN_NARGS:
        subprocess.call("cat $PLAGCHECK/src/plagcheck.usage", shell=True)
        sys.exit(os.EX_SOFTWARE)

    # create a command line parser:
    #  the defaults are supplied here so that no option is left as None
    #
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(ARG_ABRV_HELP, ARG_HELP, action='store_true')
    parser.add_argument(ARG_ABRV_LANGUAGE, ARG_LANGUAGE, type=str,
                        default=DEF_LANGUAGE)
    parser.add_argument(ARG_ABRV_DIR, ARG_DIR, type=str,
                        default=DEF_DIRECTORY)
    parser.add_argument(ARG_ABRV_THRESHOLD, ARG_THRESHOLD, type=int,
                        default=DEF_SIMILARITY_THRESHOLD)

    # parse the command line:
    #  use the arguments that were passed in (skipping the program name)
    #
    args = parser.parse_args(argv[1:])

    # check for a help flag or condition
    #
    if args.help:
        subprocess.call("cat $PLAGCHECK/src/plagcheck.help", shell=True)
        sys.exit(os.EX_SOFTWARE)

    # create a list of acceptable file extensions for the given language
    #
    if args.language == DEF_LANG_C:
        extensions = DEF_LANG_C_EXTS
    elif args.language == DEF_LANG_PY:
        extensions = DEF_LANG_PY_EXTS
    elif args.language == DEF_LANG_ML:
        extensions = DEF_LANG_ML_EXTS
    else:
        print("Error (%s): the language (%s) is not supported" %
              (__FILE__, args.language))
        sys.exit(os.EX_SOFTWARE)

    # create a list that holds the paths to each matching file
    #
    code_files = _find_code_files(args.directory, extensions)

    if not code_files:
        print("Error (%s): error opening the directory (%s)" %
              (__FILE__, args.directory))
        sys.exit(os.EX_SOFTWARE)

    # locate the installation: the perl script lives under $PLAGCHECK
    #
    plagcheck_root = os.environ.get('PLAGCHECK')
    if plagcheck_root is None:
        print("Error (%s): the environment variable PLAGCHECK is not set" %
              (__FILE__))
        sys.exit(os.EX_SOFTWARE)

    # run the perl script and capture its standard output
    #
    result = subprocess.run(["perl", f"{plagcheck_root}/src/moss.pl",
                             ARG_ABRV_LANGUAGE, args.language,
                             ARG_ABRV_DIR, *code_files],
                            stdout=subprocess.PIPE)

    # split the output by newline and grab the link from the last line
    #
    lines = result.stdout.splitlines()
    if not lines:
        print("Error (%s): %s" %
              (__FILE__, "no output was received from the perl script"))
        sys.exit(os.EX_SOFTWARE)

    # the output is a bytes object, so decode it into text rather than
    # slicing the "b'...'" wrapper off its string representation
    #
    final_link = lines[-1].decode("utf-8", errors="replace").strip()

    # Check for one of the error codes from the website checker
    #
    if nedc_web_checker(final_link) is False:
        print("Error (%s): %s (%s)" %
              (__FILE__, "could not access the site", final_link))
        sys.exit(os.EX_SOFTWARE)

    # Send the generated link to the web scraper, so that it can be used to
    # display the data in a user-friendly manner
    #
    scr.nedc_scraper(final_link, args.threshold)

    # exit gracefully
    #
    return True
#
# end of function

# begin gracefully:
#  run main only when this file is executed as a script, handing it a
#  copy of the full command line (program name included)
#
if __name__ == "__main__":
    main(sys.argv[:])

#
# end of file