Class `swarmauri_standard.embeddings.TfidfEmbedding.TfidfEmbedding`

swarmauri_standard.embeddings.TfidfEmbedding.TfidfEmbedding

TfidfEmbedding(**kwargs)

Bases: EmbeddingBase

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def __init__(self, **kwargs):
    """Forward all keyword arguments to the parent initializer, then reset model state."""
    super().__init__(**kwargs)
    # Start out unfitted; fit() is what populates these three attributes.
    self._fit_matrix = []  # TF-IDF vectors, one per fitted document.
    self._idf = {}  # Maps each vocabulary token to its idf value.
    self._features = []  # The vocabulary.

type `class-attribute` `instance-attribute`

type = 'TfidfEmbedding'

model_config `class-attribute` `instance-attribute`

model_config = ConfigDict(
    extra="allow", arbitrary_types_allowed=True
)

id `class-attribute` `instance-attribute`

id = Field(default_factory=generate_id)

members `class-attribute` `instance-attribute`

members = None

owners `class-attribute` `instance-attribute`

owners = None

host `class-attribute` `instance-attribute`

host = None

default_logger `class-attribute`

default_logger = None

logger `class-attribute` `instance-attribute`

logger = None

name `class-attribute` `instance-attribute`

name = None

resource `class-attribute` `instance-attribute`

resource = Field(default=EMBEDDING.value, frozen=True)

version `class-attribute` `instance-attribute`

version = '0.1.0'

extract_features

extract_features()

Returns the list of features (vocabulary terms) that were extracted during fitting.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def extract_features(self) -> List[str]:
    """
    Return the vocabulary terms collected by the most recent fit.

    The stored list itself is handed back, not a copy.
    """
    vocabulary = self._features
    return vocabulary

fit

fit(documents)

Fits the TF-IDF model on the provided documents. It computes the vocabulary, document frequencies, idf values, and the TF-IDF vectors for each document.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def fit(self, documents: List[str]) -> None:
    """
    Learn the vocabulary, idf weights and per-document TF-IDF vectors.

    Every document is lowercased and split on whitespace. The vocabulary is
    stored in sorted order, idf is ``log(N / df)`` where ``N`` is the number
    of documents and ``df`` the number of documents containing the token, and
    term frequency is the token count divided by the document length
    (``0.0`` for a document without tokens).
    """
    corpus_size = len(documents)

    # Lowercasing plus a whitespace split is the entire tokenizer.
    token_lists = [text.lower().split() for text in documents]

    # Document frequency: a token counts once per document it occurs in.
    doc_freq = defaultdict(int)
    for words in token_lists:
        for word in set(words):
            doc_freq[word] += 1

    # A sorted vocabulary keeps the vector columns in a stable order.
    vocabulary = sorted(doc_freq)
    self._features = vocabulary

    idf = {term: math.log(corpus_size / doc_freq[term]) for term in vocabulary}
    self._idf = idf

    # One row per document, one column per vocabulary term.
    matrix = []
    for words in token_lists:
        counts = Counter(words)
        total = len(words)
        row = [
            (counts[term] / total if total > 0 else 0.0) * idf[term]
            for term in vocabulary
        ]
        matrix.append(row)
    self._fit_matrix = matrix

fit_transform

fit_transform(documents)

Fits the model on the provided documents and returns the TF-IDF vectors as a list of Vector instances.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def fit_transform(self, documents: List[str]) -> List[Vector]:
    """
    Fit the model on ``documents`` and return their TF-IDF vectors.

    Each row produced by ``fit`` is wrapped in a ``Vector`` instance, in the
    same order as the input documents.
    """
    self.fit(documents)
    wrapped = []
    for row in self._fit_matrix:
        wrapped.append(Vector(value=row))
    return wrapped

transform

transform(documents)

Transforms new documents into TF-IDF vectors using the vocabulary and idf values computed during fitting. Any term not in the vocabulary is ignored.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def transform(self, documents: List[str]) -> List[Vector]:
    """
    Turn new documents into TF-IDF vectors using the fitted vocabulary and idf.

    Tokens outside the fitted vocabulary contribute nothing, because only
    vocabulary terms are iterated when building each vector.

    Raises:
        ValueError: If the vocabulary or the idf table is empty, i.e. the
            model has not been fitted (or loaded) yet.
    """
    fitted = bool(self._features) and bool(self._idf)
    if not fitted:
        raise ValueError(
            "The model has not been fitted yet. Please call fit first."
        )

    results = []
    for text in documents:
        words = text.lower().split()
        counts = Counter(words)
        total = len(words)
        # A vocabulary term missing from the idf table is weighted 0.0.
        weights = [
            (counts[term] / total if total > 0 else 0.0) * self._idf.get(term, 0.0)
            for term in self._features
        ]
        results.append(Vector(value=weights))
    return results

infer_vector

infer_vector(data, documents)

Infers a TF-IDF vector for a new document. In this implementation, we append the new document to the provided corpus, re-fit the model, and return the vector for the new document. (Note: This re-fits the model which might be inefficient for production but mirrors the original logic.)

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def infer_vector(self, data: str, documents: List[str]) -> Vector:
    """
    Infer a TF-IDF vector for a new document.

    The new document is placed after the provided corpus, the model is
    re-fitted on that combined corpus via ``fit_transform``, and the vector
    for the new document (the last one) is returned. Re-fitting on every call
    may be inefficient for production use.

    Parameters:
        data (str): The new document to embed.
        documents (List[str]): The reference corpus. It is not modified.

    Returns:
        Vector: The TF-IDF vector of ``data`` relative to the combined corpus.
    """
    # Build a fresh list instead of appending to ``documents``: mutating the
    # caller's list would make the corpus grow on every call.
    corpus = [*documents, data]
    return self.fit_transform(corpus)[-1]

save_model

save_model(path)

Saves the TF-IDF model (i.e. the vocabulary and idf values) to the specified path using joblib.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def save_model(self, path: str) -> None:
    """
    Persist the fitted vocabulary and idf table to ``path`` with joblib.

    Only the ``"features"`` and ``"idf"`` entries are written; the fitted
    TF-IDF matrix is not part of the saved data.
    """
    payload = dict(features=self._features, idf=self._idf)
    joblib.dump(payload, path)

load_model

load_model(path)

Loads a TF-IDF model (i.e. the vocabulary and idf values) from the specified path using joblib.

Source code in swarmauri_standard/embeddings/TfidfEmbedding.py

def load_model(self, path: str) -> None:
    """
    Restore the vocabulary and idf table from ``path`` using joblib.

    A missing ``"features"`` entry falls back to an empty list and a missing
    ``"idf"`` entry to an empty dict. The fitted TF-IDF matrix is left as it
    was before the call.

    NOTE(review): joblib persistence is presumably pickle-based, so only
    files from trusted sources should be loaded here - confirm.
    """
    stored = joblib.load(path)
    vocabulary = stored.get("features", [])
    self._features = vocabulary
    idf_table = stored.get("idf", {})
    self._idf = idf_table

register_model `classmethod`

register_model()

Decorator to register a base model in the unified registry.

RETURNS	DESCRIPTION
`Callable`	A decorator function that registers the model class. TYPE: `Callable[[Type[BaseModel]], Type[BaseModel]]`

Source code in swarmauri_base/DynamicBase.py

@classmethod
def register_model(cls) -> Callable[[Type[BaseModel]], Type[BaseModel]]:
    """
    Build a decorator that records a base model in the unified registry.

    Returns:
        Callable: A decorator that registers the decorated class under its
        own ``__name__`` and hands the class back unchanged.
    """

    def decorator(model_cls: Type[BaseModel]):
        """Add ``model_cls`` to the registry unless its name is already taken."""
        key = model_cls.__name__
        if key not in cls._registry:
            # First registration: create the entry with an empty subtype map.
            cls._registry[key] = {"model_cls": model_cls, "subtypes": {}}
            glogger.debug("Registered base model '%s'.", key)
            DynamicBase._recreate_models()
        else:
            glogger.warning(
                "Model '%s' is already registered; skipping duplicate.", key
            )
        return model_cls

    return decorator

register_type `classmethod`

register_type(resource_type=None, type_name=None)

Decorator to register a subtype under one or more base models in the unified registry.

PARAMETER	DESCRIPTION
`resource_type`	The base model(s) under which to register the subtype. If None, all direct base classes (except DynamicBase) are used. TYPE: `Optional[Union[Type[T], List[Type[T]]]]` DEFAULT: `None`
`type_name`	An optional custom type name for the subtype. TYPE: `Optional[str]` DEFAULT: `None`

RETURNS	DESCRIPTION
`Callable`	A decorator function that registers the subtype. TYPE: `Callable[[Type[DynamicBase]], Type[DynamicBase]]`

Source code in swarmauri_base/DynamicBase.py

@classmethod
def register_type(
    cls,
    resource_type: Optional[Union[Type[T], List[Type[T]]]] = None,
    type_name: Optional[str] = None,
) -> Callable[[Type["DynamicBase"]], Type["DynamicBase"]]:
    """
    Build a decorator that records a subtype under one or more base models.

    Parameters:
        resource_type (Optional[Union[Type[T], List[Type[T]]]]):
            Base model, or list of base models, to register the subtype
            under. When None, every direct base class of the decorated class
            is used, excluding the class this method was invoked on (``cls``).
        type_name (Optional[str]): Custom name for the subtype. When falsy,
            the subclass's ``_type`` attribute is used if present, otherwise
            its ``__name__``.

    Returns:
        Callable: A decorator that registers the subtype and returns it.
    """

    def decorator(subclass: Type["DynamicBase"]):
        """Register ``subclass`` under each selected base model."""
        # Normalise the requested base model(s) into a list.
        if isinstance(resource_type, list):
            targets = resource_type
        elif resource_type is None:
            targets = [parent for parent in subclass.__bases__ if parent is not cls]
        else:
            targets = [resource_type]

        for target in targets:
            if not issubclass(subclass, target):
                raise TypeError(
                    f"'{subclass.__name__}' must be a subclass of '{target.__name__}'."
                )

            label = type_name or getattr(subclass, "_type", subclass.__name__)
            base_name = target.__name__

            # Base models are added to the registry the first time they are used.
            if base_name not in cls._registry:
                cls._registry[base_name] = {"model_cls": target, "subtypes": {}}
                glogger.debug(
                    "Created new registry entry for base model '%s'.",
                    base_name,
                )

            known = cls._registry[base_name]["subtypes"]
            if label not in known:
                known[label] = subclass
                glogger.debug(
                    "Registered '%s' as '%s' under '%s'.",
                    subclass.__name__,
                    label,
                    base_name,
                )
            else:
                # The first registration wins; later duplicates are ignored.
                glogger.warning(
                    "Type '%s' already exists under '%s'; skipping duplicate.",
                    label,
                    base_name,
                )

        DynamicBase._recreate_models()
        return subclass

    return decorator

model_validate_toml `classmethod`

model_validate_toml(toml_data)

Validate a model from a TOML string.

Source code in swarmauri_base/TomlMixin.py

@classmethod
def model_validate_toml(cls, toml_data: str):
    """Validate a model from a TOML string."""
    try:
        # Parse TOML into a Python dictionary
        toml_content = tomllib.loads(toml_data)

        # Convert the dictionary to JSON and validate using Pydantic
        return cls.model_validate_json(json.dumps(toml_content))
    except tomllib.TOMLDecodeError as e:
        raise ValueError(f"Invalid TOML data: {e}")
    except ValidationError as e:
        raise ValueError(f"Validation failed: {e}")

model_dump_toml

model_dump_toml(
    fields_to_exclude=None, api_key_placeholder=None
)

Return a TOML representation of the model.

Source code in swarmauri_base/TomlMixin.py

def model_dump_toml(self, fields_to_exclude=None, api_key_placeholder=None):
    """
    Return a TOML representation of the model.

    The model is dumped to JSON first. Keys listed in ``fields_to_exclude``
    are dropped at every nesting level, and when ``api_key_placeholder`` is
    not None every ``"api_key"`` value is replaced by it.
    """
    excluded = [] if fields_to_exclude is None else fields_to_exclude

    def scrub(node):
        """Return ``node`` with excluded keys removed and api keys masked."""
        if isinstance(node, list):
            return [scrub(element) for element in node]
        if not isinstance(node, dict):
            return node

        cleaned = {}
        for key, value in node.items():
            if key in excluded:
                continue
            if key == "api_key" and api_key_placeholder is not None:
                cleaned[key] = api_key_placeholder
            else:
                cleaned[key] = scrub(value)
        return cleaned

    # Round-trip through JSON so only plain JSON-compatible values remain.
    plain_data = json.loads(self.model_dump_json())
    return toml.dumps(scrub(plain_data))

model_validate_yaml `classmethod`

model_validate_yaml(yaml_data)

Validate a model from a YAML string.

Source code in swarmauri_base/YamlMixin.py

@classmethod
def model_validate_yaml(cls, yaml_data: str):
    """Validate a model from a YAML string."""
    try:
        # Parse YAML into a Python dictionary
        yaml_content = yaml.safe_load(yaml_data)

        # Convert the dictionary to JSON and validate using Pydantic
        return cls.model_validate_json(json.dumps(yaml_content))
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML data: {e}")
    except ValidationError as e:
        raise ValueError(f"Validation failed: {e}")

model_dump_yaml

model_dump_yaml(
    fields_to_exclude=None, api_key_placeholder=None
)

Return a YAML representation of the model.

Source code in swarmauri_base/YamlMixin.py

def model_dump_yaml(self, fields_to_exclude=None, api_key_placeholder=None):
    """
    Return a YAML representation of the model.

    The model is dumped to JSON first. Keys listed in ``fields_to_exclude``
    are dropped at every nesting level, and when ``api_key_placeholder`` is
    not None every ``"api_key"`` value is replaced by it.
    """
    excluded = [] if fields_to_exclude is None else fields_to_exclude

    def scrub(node):
        """Return ``node`` with excluded keys removed and api keys masked."""
        if isinstance(node, list):
            return [scrub(element) for element in node]
        if not isinstance(node, dict):
            return node

        cleaned = {}
        for key, value in node.items():
            if key in excluded:
                continue
            if key == "api_key" and api_key_placeholder is not None:
                cleaned[key] = api_key_placeholder
            else:
                cleaned[key] = scrub(value)
        return cleaned

    # Round-trip through JSON so only plain JSON-compatible values remain.
    plain_data = json.loads(self.model_dump_json())

    # Block style (not flow style) output, emitted with the safe dumper.
    return yaml.safe_dump(scrub(plain_data), default_flow_style=False)

model_post_init

model_post_init(logger=None)

Assign a logger instance after model initialization.

Source code in swarmauri_base/LoggerMixin.py

def model_post_init(self, logger: Optional[FullUnion[LoggerBase]] = None) -> None:
    """
    Settle which logger the instance uses once initialization is done.

    Preference order, decided by truthiness: the logger already set on the
    instance, then the ``logger`` argument, then the class-level
    ``default_logger``.
    """
    chosen = self.logger
    if not chosen:
        chosen = logger
    if not chosen:
        # Nothing usable on the instance or in the argument.
        chosen = self.default_logger
    self.logger = chosen

Class swarmauri_standard.embeddings.TfidfEmbedding.TfidfEmbedding

swarmauri_standard.embeddings.TfidfEmbedding.TfidfEmbedding

type class-attribute instance-attribute

model_config class-attribute instance-attribute

id class-attribute instance-attribute

members class-attribute instance-attribute

owners class-attribute instance-attribute

host class-attribute instance-attribute

default_logger class-attribute

logger class-attribute instance-attribute

name class-attribute instance-attribute

resource class-attribute instance-attribute

version class-attribute instance-attribute

extract_features

fit

fit_transform

transform

infer_vector

save_model

load_model

register_model classmethod

register_type classmethod

model_validate_toml classmethod

model_dump_toml

model_validate_yaml classmethod

model_dump_yaml

model_post_init

Class `swarmauri_standard.embeddings.TfidfEmbedding.TfidfEmbedding`

type `class-attribute` `instance-attribute`

model_config `class-attribute` `instance-attribute`

id `class-attribute` `instance-attribute`

members `class-attribute` `instance-attribute`

owners `class-attribute` `instance-attribute`

host `class-attribute` `instance-attribute`

default_logger `class-attribute`

logger `class-attribute` `instance-attribute`

name `class-attribute` `instance-attribute`

resource `class-attribute` `instance-attribute`

version `class-attribute` `instance-attribute`

register_model `classmethod`

register_type `classmethod`

model_validate_toml `classmethod`

model_validate_yaml `classmethod`