Reduzieren Sie Risiken und Kosten durch externe Softwaretests

Verbessern Sie die Softwarequalität in allen Projektphasen durch flexible Softwaretests. In einen kostenlosem Check-Up bewerten wir Ihnen mögliche Risiken und ermitteln den Status Quo der Software-Qualitätssicherung in Ihrem Unternehmen.

import click
from .matcher import Text, ExtendedMatch, Matcher
import os
import glob
import csv
import logging
import itertools
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def getFiles(path):
    """
    Determines whether a path is a file or directory.
    If it's a directory, it gets a list of all the text files
    in that directory, recursively. If not, it gets the file.
    """
    if os.path.isfile(path):
        return [path]
    elif os.path.isdir(path):
        // Get list of all files in dir, recursively.
        return glob.glob(path + "/**/*.txt", recursive=True)
    else:
        raise click.ClickException("The path %s doesn't appear to be a file or directory" % path)
def checkLog(logfile, textpair):
    """
    Checks the log file to make sure we haven't already done a particular analysis.
    Returns True if the pair is in the log already.
    """
    pairs = []
    logging.debug('Looking in the log for textpair:' % textpair)
    if not os.path.isfile(logfile):
        logging.debug('No log file found.')
        return None
    with open(logfile, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            pairs.append([row[0], row[1]])
    // logging.debug('Pairs already in log: %s' % pairs)
    return textpair in pairs
def createLog(logfile, columnLabels):
    """
    Creates a log file and sets up headers so that it can be easily read
    as a CSV later.
    """
    header = ','.join(columnLabels) + '\n'
    with open(logfile, 'w') as f:
        f.write(header)
        f.close
@click.command()
@click.argument('text1')
@click.argument('text2')
@click.option('-t', '--threshold', type=int, default=3, \
        help='The shortest length of match to include in the list of initial matches.')
@click.option('-c', '--cutoff', type=int, default=5, \
        help='The shortest length of match to include in the final list of extended matches.')
@click.option('-n', '--ngrams', type=int, default=3, \
        help='The ngram n-value to match against.')
@click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.'
@click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False)
@click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.')
def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops):
    """ This program finds similar text in two text files. """
    //Determine whether the given path is a file or directory.
    texts1 = getFiles(text1)
    texts2 = getFiles(text2)
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    if stops:
        logging.debug('Including stopwords in tokenizing.')
    logging.debug('Comparing this/these text(s): %s' % str(texts1))
    logging.debug('with this/these text(s): %s' % str(texts2))
    pairs = list(itertools.product(texts1, texts2))
    numPairs = len(pairs)
    logging.debug('Comparing %s pairs.' % numPairs)
    // logging.debug('List of pairs to compare: %s' % pairs)
    logging.debug('Loading files into memory.')
    texts = {}
    prevTextObjs = {}
    for filename in texts1+texts2:
        with open(filename, errors="ignore") as f:
            text = f.read()
        if filename not in texts:
            texts[filename] = text
    logging.debug('Loading complete.')
    for index, pair in enumerate(pairs):
        timeStart = os.times().elapsed
        logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs))
        logging.debug('Comparing %s with %s.' % (pair[0], pair[1]))
        // Make sure we haven't already done this pair.
        inLog = checkLog(logfile, [pair[0], pair[1]])
        if inLog is None:
            // This means that there isn't a log file. Let's set one up.
            // Set up columns and their labels.
            columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B']
            logging.debug('No log file found. Setting one up.')
            createLog(logfile, columnLabels)
        if inLog:
            logging.debug('This pair is already in the log. Skipping.')
            continue
        logging.debug('Processing texts.')
        filenameA, filenameB = pair[0], pair[1]
        textA, textB = texts[filenameA], texts[filenameB]
        // Put this in a dictionary so we don't have to process a file twice.
        for filename in [filenameA, filenameB]:
            if filename not in prevTextObjs:
                logging.debug('Processing text: %s' % filename)
                prevTextObjs[filename] = Text(texts[filename], filename)
        // Just more convenient naming.
        textObjA = prevTextObjs[filenameA]
        textObjB = prevTextObjs[filenameB]
        // Reset the table of previous text objects, so we don't overload memory.
        // This means we'll only remember the previous two texts.
        prevTextObjs =
        // Do the matching.
        myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops)
        myMatch.match()
        timeEnd = os.times().elapsed
        timeElapsed = timeEnd-timeStart
        logging.debug('Matching completed in %s seconds.' % timeElapsed)
        // Write to the log, but only if a match is found.
        if myMatch.numMatches > 0:
            logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)]
            logging.debug('Logging items: %s' % str(logItems))
            line = ','.join(['"%s"' % item for item in logItems]) + '\n'
            f = open(logfile, 'a')
            f.write(line)
            f.close()
if __name__ == '__main__':
    cli()

import click
from .matcher import Text, ExtendedMatch, Matcher
import os
import glob
import csv
import logging
import itertools
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def getFiles(path):
    """
    Determines whether a path is a file or directory.
    If it's a directory, it gets a list of all the text files
    in that directory, recursively. If not, it gets the file.
    """
    if os.path.isfile(path):
        return [path]
    elif os.path.isdir(path):
        // Get list of all files in dir, recursively.
        return glob.glob(path + "/**/*.txt", recursive=True)
    else:
        raise click.ClickException("The path %s doesn't appear to be a file or directory" % path)
def checkLog(logfile, textpair):
    """
    Checks the log file to make sure we haven't already done a particular analysis.
    Returns True if the pair is in the log already.
    """
    pairs = []
    logging.debug('Looking in the log for textpair:' % textpair)
    if not os.path.isfile(logfile):
        logging.debug('No log file found.')
        return None
    with open(logfile, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            pairs.append([row[0], row[1]])
    // logging.debug('Pairs already in log: %s' % pairs)
    return textpair in pairs
def createLog(logfile, columnLabels):
    """
    Creates a log file and sets up headers so that it can be easily read
    as a CSV later.
    """
    header = ','.join(columnLabels) + '\n'
    with open(logfile, 'w') as f:
        f.write(header)
        f.close
@click.command()
@click.argument('text1')
@click.argument('text2')
@click.option('-t', '--threshold', type=int, default=3, \
        help='The shortest length of match to include in the list of initial matches.')
@click.option('-c', '--cutoff', type=int, default=5, \
        help='The shortest length of match to include in the final list of extended matches.')
@click.option('-n', '--ngrams', type=int, default=3, \
        help='The ngram n-value to match against.')
@click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.'
@click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False)
@click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.')
def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops):
    """ This program finds similar text in two text files. """
    //Determine whether the given path is a file or directory.
    texts1 = getFiles(text1)
    texts2 = getFiles(text2)
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    if stops:
        logging.debug('Including stopwords in tokenizing.')
    logging.debug('Comparing this/these text(s): %s' % str(texts1))
    logging.debug('with this/these text(s): %s' % str(texts2))
    pairs = list(itertools.product(texts1, texts2))
    numPairs = len(pairs)
    logging.debug('Comparing %s pairs.' % numPairs)
    // logging.debug('List of pairs to compare: %s' % pairs)
    logging.debug('Loading files into memory.')
    texts = {}
    prevTextObjs = {}
    for filename in texts1+texts2:
        with open(filename, errors="ignore") as f:
            text = f.read()
        if filename not in texts:
            texts[filename] = text
    logging.debug('Loading complete.')
    for index, pair in enumerate(pairs):
        timeStart = os.times().elapsed
        logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs))
        logging.debug('Comparing %s with %s.' % (pair[0], pair[1]))
        // Make sure we haven't already done this pair.
        inLog = checkLog(logfile, [pair[0], pair[1]])
        if inLog is None:
            // This means that there isn't a log file. Let's set one up.
            // Set up columns and their labels.
            columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B']
            logging.debug('No log file found. Setting one up.')
            createLog(logfile, columnLabels)
        if inLog:
            logging.debug('This pair is already in the log. Skipping.')
            continue
        logging.debug('Processing texts.')
        filenameA, filenameB = pair[0], pair[1]
        textA, textB = texts[filenameA], texts[filenameB]
        // Put this in a dictionary so we don't have to process a file twice.
        for filename in [filenameA, filenameB]:
            if filename not in prevTextObjs:
                logging.debug('Processing text: %s' % filename)
                prevTextObjs[filename] = Text(texts[filename], filename)
        // Just more convenient naming.
        textObjA = prevTextObjs[filenameA]
        textObjB = prevTextObjs[filenameB]
        // Reset the table of previous text objects, so we don't overload memory.
        // This means we'll only remember the previous two texts.
        prevTextObjs =
        // Do the matching.
        myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops)
        myMatch.match()
        timeEnd = os.times().elapsed
        timeElapsed = timeEnd-timeStart
        logging.debug('Matching completed in %s seconds.' % timeElapsed)
        // Write to the log, but only if a match is found.
        if myMatch.numMatches > 0:
            logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)]
            logging.debug('Logging items: %s' % str(logItems))
            line = ','.join(['"%s"' % item for item in logItems]) + '\n'
            f = open(logfile, 'a')
            f.write(line)
            f.close()
if __name__ == '__main__':
    cli()