Source code for orangearg.argument.miner.processor

"""Argument processor module."""

from typing import List, Dict, Tuple
import copy
import math
from collections import defaultdict

import numpy as np
import pandas as pd



[docs]
def _match_list_size(*args: List):
    """Ensure that all given lists share the same length.

    The length of the first list is the reference; calling with no lists
    is a no-op.

    Args:
        *args (List): any number of lists to compare.

    Raises:
        ValueError: if at least one list differs in length from the first one.
    """
    for candidate in args:
        # Stop at the first list whose length deviates from the reference.
        if len(candidate) != len(args[0]):
            raise ValueError(f"Input size not match: {args}.")




[docs]
def _aggregate_list_by_another(keys: List, values: List) -> Dict:
    """Group the items of ``values`` by the key found at the same position in ``keys``.

    Args:
        keys (List): The group key of each position.
        values (List): The items to be grouped; indexed by position in ``keys``.

    Returns:
        Dict: A ``defaultdict(list)`` mapping each distinct key to the list of
            values sharing that key. Keys appear in order of first occurrence,
            and values keep their input order within each group.
    """
    grouped = defaultdict(list)
    for position, group_key in enumerate(keys):
        bucket = grouped[group_key]
        bucket.append(values[position])
    return grouped




[docs]
def get_argument_topics(arg_ids: List[int], topics: List[int]) -> List[Tuple[int, ...]]:
    """Get argument topics.

    The topics of an argument are the combination of the topics of all chunks that belong to this argument. Duplicates are not removed, because a repeated topic can be treated as a sign of topic importance. Also, even though two chunks can belong to the same topic, they could still have different ranks within an argument.

    Args:
        arg_ids (List[int]): the argument ids of chunks.
        topics (List[int]): the topic indices of chunks.

    Returns:
        List[Tuple[int, ...]]: one tuple per distinct argument id, ordered by the first appearance of that id in ``arg_ids``. Each tuple holds the topic indices of the chunks belonging to that argument, in input order, so tuples may differ in length.

    Raises:
        ValueError: if ``arg_ids`` and ``topics`` differ in length.
    """
    _match_list_size(arg_ids, topics)
    topics_by_arg = _aggregate_list_by_another(keys=arg_ids, values=topics)
    # Tuples (rather than lists) make each argument's topic collection immutable and hashable.
    return [tuple(chunk_topics) for chunk_topics in topics_by_arg.values()]




[docs]
def get_argument_sentiment(
    arg_ids: List[int],
    ranks: List[float],
    p_scores: List[float],
    min_sent: int = -1,
    max_sent: int = 1,
) -> List[float]:
    """Compute one sentiment score per argument.

    For every argument, the sentiment polarity scores of its chunks are combined into a weighted sum (dot product) in which the chunk ranks act as weights. The sum is then min-max rescaled with ``min_sent`` and ``max_sent``. No clipping is applied, so the result lies in [0, 1] only when the weighted sum falls within [min_sent, max_sent].

    Args:
        arg_ids (List[int]): the argument ids of chunks.
        ranks (List[float]): the pagerank of chunks within arguments.
        p_scores (List[float]): the sentiment polarity scores of chunks.
        min_sent (int): minimum of argument sentiment before normalization. Defaults to -1.
        max_sent (int): maximum of argument sentiment before normalization. Defaults to 1.

    Returns:
        List[float]: argument sentiment scores, one per distinct argument id, ordered by the first appearance of that id in ``arg_ids``.

    Raises:
        ValueError: if the three input lists differ in length.
    """
    _match_list_size(arg_ids, ranks, p_scores)

    ranks_by_arg = _aggregate_list_by_another(keys=arg_ids, values=ranks)
    scores_by_arg = _aggregate_list_by_another(keys=arg_ids, values=p_scores)

    # Both groupings were built from the same ``arg_ids``, so every key of
    # ``ranks_by_arg`` is also present in ``scores_by_arg``.
    return [
        (np.dot(arg_ranks, scores_by_arg[arg_id]) - min_sent) / (max_sent - min_sent)
        for arg_id, arg_ranks in ranks_by_arg.items()
    ]




[docs]
def get_argument_coherence(
    scores: List[int],
    sentiments: List[float],
    min_score: int = 1,
    max_score: int = 5,
    variance: float = 0.2,
) -> List[float]:
    """Compute one coherence value per argument.

    Coherence is the inverted difference between an argument's sentiment and its overall score. The overall scores are first min-max rescaled with ``min_score`` and ``max_score`` so that they are comparable with the sentiments. The difference of each pair is then passed through a Gaussian kernel, which maps a difference of zero to 1 and larger differences to values closer to 0.

    Args:
        scores (List[int]): List of argument overall scores.
        sentiments (List[float]): List of argument sentiment scores.
        min_score (int, optional): Lower bound of scores. Defaults to 1.
        max_score (int, optional): Upper bound of scores. Defaults to 5.
        variance (float): variance of the Gaussian kernel. Defaults to 0.2.

    Returns:
        List[float]: List of argument coherence scores, in range of (0, 1] for a positive ``variance``.

    Raises:
        ValueError: if ``sentiments`` and ``scores`` differ in length.
    """
    _match_list_size(sentiments, scores)

    score_span = max_score - min_score

    def kernel(diff):
        """Unnormalized Gaussian: 1 at ``diff == 0``, decaying as ``|diff|`` grows."""
        return math.e ** (-(diff**2) / (2 * variance))

    normalized_scores = [(raw - min_score) / score_span for raw in scores]
    differences = [
        sentiments[position] - normalized
        for position, normalized in enumerate(normalized_scores)
    ]
    return [kernel(diff) for diff in differences]




[docs]
def update_argument_table(
    df_arguments: pd.DataFrame,
    topics: List[List[int]],
    sentiments: List[float],
    coherences: List[float],
) -> pd.DataFrame:
    """Build an extended copy of the argument dataframe.

    The input dataframe is deep-copied and therefore left untouched. The copy gains (or has overwritten) the columns ``topics``, ``sentiment`` and ``coherence``.

    Args:
        df_arguments (pd.DataFrame): argument dataframe.
        topics (List[List[int]]): list of argument topics, one entry per row.
        sentiments (List[float]): list of argument sentiment scores, one entry per row.
        coherences (List[float]): list of argument coherence scores, one entry per row.

    Returns:
        pd.DataFrame: a deep copy of ``df_arguments`` with the ``topics``, ``sentiment`` and ``coherence`` columns set.
    """
    extended = copy.deepcopy(df_arguments)
    new_columns = (
        ("topics", topics),
        ("sentiment", sentiments),
        ("coherence", coherences),
    )
    # Assign in a fixed order so newly created columns are appended consistently.
    for column_name, column_values in new_columns:
        extended[column_name] = column_values
    return extended