#!/usr/bin/env python
#
# file: $PLAGCHECK/src/scraper.py
#
# revision history
#
# 20221226 (JP): reviewed
# 20221222 (PM): reviewed and refactored
# 20220202 (JL): initial version
#
# Purpose: a function that takes a moss.pl link as an argument,
# and displays the info from the plagiarism checker in a user-friendly manner.
#------------------------------------------------------------------------------

# import system modules
#
from bs4 import BeautifulSoup
import os
import re
import requests
import sys

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# regex constants:
#  RE_FILE_NAME captures whatever follows " file: " on a line, provided the
#   character in front of that space is not a "/" (case-insensitive)
#  RE_GET_NUM matches runs of non-alphanumeric characters; substituting
#   them away turns a token such as "(23%)" into "23"
#
RE_FILE_NAME = re.compile(r"[^//] file: (.*)", re.IGNORECASE)
RE_GET_NUM = '[^A-Za-z0-9]+'

# HTML elements constants
#
A = "a"
BS_HTMLP = "html.parser"
FRAME = "frame"
HREF = "href"
PRE = "pre"
SRC = "src"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# function: nedc_scraper
#
# argument:
#  file_link: the overview link given by moss [str]
#  threshold: the threshold to compare the similarity to [int]
#
# return: a boolean value indicating status
#
# this function opens the link, scrapes the information and prints it out.
#
def nedc_scraper(file_link, threshold):
    """Print every Moss match whose similarity exceeds *threshold*.

    The overview page at *file_link* is downloaded and its anchors are
    walked two at a time (one anchor per file of a match). For each match
    the two code pages are fetched to recover the file names, and the pair
    is printed when either similarity percentage is above *threshold*.

    Returns True when the whole report was processed, or False (after
    printing an error) when a page cannot be retrieved, an overview entry
    has no link, or an entry's text cannot be parsed.
    """

    # fetch and parse the overview page
    #
    overview = _fetch_soup(file_link)
    if overview is None:
        return False

    # first 6 tags always are the same data, will not ever be used
    #
    tags = overview.find_all(A)[6:]

    # NOTE(review): the counter is assumed to number only the pairs that
    # are actually reported - confirm against the intended layout
    #
    pair_count = 1

    # each match contributes two consecutive anchors, so consume them in
    # pairs; zip() silently drops a trailing anchor that has no partner
    #
    for tag_0, tag_1 in zip(tags[0::2], tags[1::2]):

        # check if there's a link
        #
        if not tag_0.has_attr(HREF):
            print("Error (%s): allatags does not have a link to the "
                  "result (%s)" % (__FILE__, tag_0))
            return False

        # access the page that shows this match
        #
        link = tag_0[HREF]
        match_page = _fetch_soup(link)
        if match_page is None:
            return False

        # start from a fresh pair so that nothing leaks over from the
        # previous match when a page is missing information
        #
        pair = (WorkFile(), WorkFile())

        # the first frame is skipped, the next two frames show the
        # students' code blocks
        #
        code_frames = match_page.find_all(FRAME)[1:3]
        for work, frame in zip(pair, code_frames):
            code_page = _fetch_soup(file_link.rstrip("/") + "/" + frame[SRC])
            if code_page is None:
                return False

            # get the student's filename by regex; the name stays empty
            # when the code has no "file:" header
            #
            blocks = code_page.find_all(PRE)
            names = re.findall(RE_FILE_NAME, str(blocks[0])) if blocks else []
            if names:
                work.file_name = names[0]
                work.base_name = names[0].split(".")[0]

        # get the directory and similar percentages measurement and
        # update the corresponding file class object
        #
        for work, tag in zip(pair, (tag_0, tag_1)):
            entry = _parse_entry(tag.get_text())
            if entry is None:
                print("Error (%s): unable to parse the overview entry (%s)" %
                      (__FILE__, tag))
                return False
            work.file_dir = entry[0] + work.file_name
            work.percentage = entry[1]

        # check against the threshold
        #
        if any(work.percentage > threshold for work in pair):

            # pretty printing
            # NOTE(review): banner strings are reproduced as found; any
            # padding they once had could not be recovered
            #
            print("==============================")
            print(f"| Pair {pair_count:02} | ")
            print("==============================")
            print()
            print(f"File 1: {pair[0].file_dir} ({pair[0].percentage}%)")
            print(f"File 2: {pair[1].file_dir} ({pair[1].percentage}%)")
            print()
            print(f"Link: {link}".rjust(10))
            print()
            pair_count += 1

    print("==============================")
    print("| End | ")
    print("==============================")
    print()
    print("Overview Link: %s" % file_link)

    # exit gracefully
    #
    return True
#
# end of function

# function: _fetch_soup
#
# arguments:
#  url: the page to download [str]
#  timeout: seconds to wait on the server before giving up [number]
#
# return: the parsed page, or None if the download failed
#
# this helper downloads one page and parses it with BeautifulSoup. a
# failure is reported on stdout instead of being raised.
#
def _fetch_soup(url, timeout=30):

    # without a timeout requests can wait forever on a stalled server
    #
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Error (%s): unable to retrieve (%s): %s" %
              (__FILE__, url, err))
        return None

    # exit gracefully
    #
    return BeautifulSoup(response.text, BS_HTMLP)
#
# end of function

# function: _parse_entry
#
# arguments:
#  text: the text of one overview anchor, e.g. "some/dir/ (23%)" [str]
#
# return: a (directory, percentage) tuple, or None if text is malformed
#
# this helper splits an overview entry into its directory and its
# similarity percentage.
#
def _parse_entry(text):

    # split on the last run of whitespace only, so that a directory
    # containing spaces stays in one piece
    #
    parts = text.strip().rsplit(None, 1)
    if len(parts) != 2:
        return None

    # convert the percentage into a number so that we can compare it to
    # the threshold
    #
    digits = re.sub(RE_GET_NUM, '', parts[1])
    if not digits.isdigit():
        return None

    # exit gracefully
    #
    return parts[0], int(digits)
#
# end of function

#------------------------------------------------------------------------------
#
# classes are listed here
#
#------------------------------------------------------------------------------

# class: WorkFile
#
# This class keeps track of the student's work information.
#
class WorkFile:

    # method: WorkFile::constructor
    #
    # arguments: none
    #
    # returns: none
    #
    def __init__(self) -> None:

        # name captured from the "file:" header of the student's code
        #
        self.file_name: str = ""

        # file_name up to (not including) its first "."
        #
        self.base_name: str = ""

        # directory taken from the overview entry followed by file_name
        #
        self.file_dir: str = ""

        # similarity percentage taken from the overview entry
        #
        self.percentage: int = 0
    #
    # end of method
#
# end of WorkFile

#
# end of file