Umfangreiche Testmanagement Lösung für komplexe Herausforderungen.
Reduzieren Sie Risiken und Kosten durch externe Softwaretests
Verbessern Sie die Softwarequalität in allen Projektphasen durch flexible Softwaretests.
import click from .matcher import Text, ExtendedMatch, Matcher import os import glob import csv import logging import itertools logger = logging.getLogger() logger.setLevel(logging.INFO) def getFiles(path): """ Determines whether a path is a file or directory. If it's a directory, it gets a list of all the text files in that directory, recursively. If not, it gets the file. """ if os.path.isfile(path): return [path] elif os.path.isdir(path): // Get list of all files in dir, recursively. return glob.glob(path + "/**/*.txt", recursive=True) else: raise click.ClickException("The path %s doesn't appear to be a file or directory" % path) def checkLog(logfile, textpair): """ Checks the log file to make sure we haven't already done a particular analysis. Returns True if the pair is in the log already. """ pairs = [] logging.debug('Looking in the log for textpair:' % textpair) if not os.path.isfile(logfile): logging.debug('No log file found.') return None with open(logfile, newline='') as f: reader = csv.reader(f) for row in reader: pairs.append([row[0], row[1]]) // logging.debug('Pairs already in log: %s' % pairs) return textpair in pairs def createLog(logfile, columnLabels): """ Creates a log file and sets up headers so that it can be easily read as a CSV later. """ header = ','.join(columnLabels) + '\n' with open(logfile, 'w') as f: f.write(header) f.close @click.command() @click.argument('text1') @click.argument('text2') @click.option('-t', '--threshold', type=int, default=3, \ help='The shortest length of match to include in the list of initial matches.') @click.option('-c', '--cutoff', type=int, default=5, \ help='The shortest length of match to include in the final list of extended matches.') @click.option('-n', '--ngrams', type=int, default=3, \ help='The ngram n-value to match against.') @click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.' @click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False) @click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.') def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops): """ This program finds similar text in two text files. """ //Determine whether the given path is a file or directory. texts1 = getFiles(text1) texts2 = getFiles(text2) if verbose: logging.basicConfig(level=logging.DEBUG) if stops: logging.debug('Including stopwords in tokenizing.') logging.debug('Comparing this/these text(s): %s' % str(texts1)) logging.debug('with this/these text(s): %s' % str(texts2)) pairs = list(itertools.product(texts1, texts2)) numPairs = len(pairs) logging.debug('Comparing %s pairs.' % numPairs) // logging.debug('List of pairs to compare: %s' % pairs) logging.debug('Loading files into memory.') texts = {} prevTextObjs = {} for filename in texts1+texts2: with open(filename, errors="ignore") as f: text = f.read() if filename not in texts: texts[filename] = text logging.debug('Loading complete.') for index, pair in enumerate(pairs): timeStart = os.times().elapsed logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs)) logging.debug('Comparing %s with %s.' % (pair[0], pair[1])) // Make sure we haven't already done this pair. inLog = checkLog(logfile, [pair[0], pair[1]]) if inLog is None: // This means that there isn't a log file. Let's set one up. // Set up columns and their labels. columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B'] logging.debug('No log file found. Setting one up.') createLog(logfile, columnLabels) if inLog: logging.debug('This pair is already in the log. Skipping.') continue logging.debug('Processing texts.') filenameA, filenameB = pair[0], pair[1] textA, textB = texts[filenameA], texts[filenameB] // Put this in a dictionary so we don't have to process a file twice. for filename in [filenameA, filenameB]: if filename not in prevTextObjs: logging.debug('Processing text: %s' % filename) prevTextObjs[filename] = Text(texts[filename], filename) // Just more convenient naming. textObjA = prevTextObjs[filenameA] textObjB = prevTextObjs[filenameB] // Reset the table of previous text objects, so we don't overload memory. // This means we'll only remember the previous two texts. prevTextObjs = // Do the matching. myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops) myMatch.match() timeEnd = os.times().elapsed timeElapsed = timeEnd-timeStart logging.debug('Matching completed in %s seconds.' % timeElapsed) // Write to the log, but only if a match is found. if myMatch.numMatches > 0: logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)] logging.debug('Logging items: %s' % str(logItems)) line = ','.join(['"%s"' % item for item in logItems]) + '\n' f = open(logfile, 'a') f.write(line) f.close() if __name__ == '__main__': cli()
import click from .matcher import Text, ExtendedMatch, Matcher import os import glob import csv import logging import itertools logger = logging.getLogger() logger.setLevel(logging.INFO) def getFiles(path): """ Determines whether a path is a file or directory. If it's a directory, it gets a list of all the text files in that directory, recursively. If not, it gets the file. """ if os.path.isfile(path): return [path] elif os.path.isdir(path): // Get list of all files in dir, recursively. return glob.glob(path + "/**/*.txt", recursive=True) else: raise click.ClickException("The path %s doesn't appear to be a file or directory" % path) def checkLog(logfile, textpair): """ Checks the log file to make sure we haven't already done a particular analysis. Returns True if the pair is in the log already. """ pairs = [] logging.debug('Looking in the log for textpair:' % textpair) if not os.path.isfile(logfile): logging.debug('No log file found.') return None with open(logfile, newline='') as f: reader = csv.reader(f) for row in reader: pairs.append([row[0], row[1]]) // logging.debug('Pairs already in log: %s' % pairs) return textpair in pairs def createLog(logfile, columnLabels): """ Creates a log file and sets up headers so that it can be easily read as a CSV later. """ header = ','.join(columnLabels) + '\n' with open(logfile, 'w') as f: f.write(header) f.close @click.command() @click.argument('text1') @click.argument('text2') @click.option('-t', '--threshold', type=int, default=3, \ help='The shortest length of match to include in the list of initial matches.') @click.option('-c', '--cutoff', type=int, default=5, \ help='The shortest length of match to include in the final list of extended matches.') @click.option('-n', '--ngrams', type=int, default=3, \ help='The ngram n-value to match against.') @click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.' @click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False) @click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.') def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops): """ This program finds similar text in two text files. """ //Determine whether the given path is a file or directory. texts1 = getFiles(text1) texts2 = getFiles(text2) if verbose: logging.basicConfig(level=logging.DEBUG) if stops: logging.debug('Including stopwords in tokenizing.') logging.debug('Comparing this/these text(s): %s' % str(texts1)) logging.debug('with this/these text(s): %s' % str(texts2)) pairs = list(itertools.product(texts1, texts2)) numPairs = len(pairs) logging.debug('Comparing %s pairs.' % numPairs) // logging.debug('List of pairs to compare: %s' % pairs) logging.debug('Loading files into memory.') texts = {} prevTextObjs = {} for filename in texts1+texts2: with open(filename, errors="ignore") as f: text = f.read() if filename not in texts: texts[filename] = text logging.debug('Loading complete.') for index, pair in enumerate(pairs): timeStart = os.times().elapsed logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs)) logging.debug('Comparing %s with %s.' % (pair[0], pair[1])) // Make sure we haven't already done this pair. inLog = checkLog(logfile, [pair[0], pair[1]]) if inLog is None: // This means that there isn't a log file. Let's set one up. // Set up columns and their labels. columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B'] logging.debug('No log file found. Setting one up.') createLog(logfile, columnLabels) if inLog: logging.debug('This pair is already in the log. Skipping.') continue logging.debug('Processing texts.') filenameA, filenameB = pair[0], pair[1] textA, textB = texts[filenameA], texts[filenameB] // Put this in a dictionary so we don't have to process a file twice. for filename in [filenameA, filenameB]: if filename not in prevTextObjs: logging.debug('Processing text: %s' % filename) prevTextObjs[filename] = Text(texts[filename], filename) // Just more convenient naming. textObjA = prevTextObjs[filenameA] textObjB = prevTextObjs[filenameB] // Reset the table of previous text objects, so we don't overload memory. // This means we'll only remember the previous two texts. prevTextObjs = // Do the matching. myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops) myMatch.match() timeEnd = os.times().elapsed timeElapsed = timeEnd-timeStart logging.debug('Matching completed in %s seconds.' % timeElapsed) // Write to the log, but only if a match is found. if myMatch.numMatches > 0: logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)] logging.debug('Logging items: %s' % str(logItems)) line = ','.join(['"%s"' % item for item in logItems]) + '\n' f = open(logfile, 'a') f.write(line) f.close() if __name__ == '__main__': cli()