Metadata Filtering
Set up¶
The following instructions assume that the Pipeline, Agent, and LLM objects from the Quick Start example already exist and that a Foundation4.ai API Server is running and accepting connections. Please see the installation instructions for details on how to set up the API server and the quick start example for instructions on how to create the necessary objects.
Test the connection¶
To confirm that the server is reachable, send a request directly to the root API endpoint; the response identifies the server.
import json
import os
import ssl
import httpx
import truststore
# Base URL of the Foundation4.ai API server (replace the fallback with a running server endpoint).
FOUNDATION4AI_API = os.environ.get('FOUNDATION4AI_API', 'https://api.foundation4ai.example.com')
# Remove trailing slashes only, so that f'{FOUNDATION4AI_API}/path' never contains '//'.
FOUNDATION4AI_API = FOUNDATION4AI_API.rstrip('/')

# API credentials, sent as headers on every request made through `client`.
api_key = os.environ.get('FOUNDATION4AI_API_KEY', '00000000-0000-0000-0000-000000000000')  # replace with API KEY id
api_key_secret = os.environ.get('FOUNDATION4AI_API_SECRET', 'VERY SECRET PASSPHRASE')  # replace with API KEY SECRET
HEADERS = {"x-api-key": api_key, "x-api-key-secret": api_key_secret}

# One shared client: TLS context built with truststore, auth headers attached.
ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
client = httpx.Client(verify=ctx, headers=HEADERS)

# The root endpoint answers with a message identifying the server.
res = client.get(FOUNDATION4AI_API)
assert res.status_code == 200 and res.json()["message"] == "Foundation4.ai Server"
Check all objects from the quick start exist¶
# Look up the Quick Start objects by name. The name is passed through `params`
# so httpx URL-encodes the spaces instead of leaving them raw in the URL.
res = client.get(f"{FOUNDATION4AI_API}/llms", params={"name": "Quick Start LLM Endpoint"})
# Fail with a clear message (rather than an IndexError) if the object is missing.
assert res.status_code == 200 and res.json()["data"], "Quick Start LLM Endpoint not found"
llm_id = res.json()["data"][0]["id"]

res = client.get(f"{FOUNDATION4AI_API}/agents", params={"name": "Quick Start Basic Agent"})
assert res.status_code == 200 and res.json()["data"], "Quick Start Basic Agent not found"
agent_id = res.json()["data"][0]["id"]

print("LLM id :", llm_id)
print("Agent id :", agent_id)
LLM id : 2fd5b72c-3ed6-482d-b7bd-d29eabc8823f Agent id : a2a40d9e-8540-4f7f-9684-7064dd3f21dc
Schemas¶
Pipelines can have metadata schemas used to validate any metadata passed in when documents are created. Metadata schemas follow the JSON Schema specification with minimal restrictions. Currently, no `$` directives are allowed (`$ref`, `$id`, `$schema`, etc.). Other constructs such as tuples and some arrays are not yet supported.
Note that setting `additionalProperties` to `true` in the schema is also discouraged, as it will break future schema field additions. Additionally, changing a schema in such a way that fields change their data type will lead to failed queries, since the Foundation4.ai API Server will attempt to conform to the new schema.
The example below creates a pipeline with a schema allowing three fields, `department`, `username`, and `start_year`:
# JSON Schema describing the metadata fields accepted for this pipeline's documents.
schema = {
    "type": "object",
    "properties": {
        "department": {
            "type": "string",
            "description": "The department to which the user belongs",
        },
        "username": {
            "type": "string",
            "description": "The username of the individual",
        },
        "start_year": {
            "type": "integer",
            "description": "The year the user started",
        },
    },
}

# Request body for the new pipeline, assembled separately from the HTTP call.
pipeline_request = {
    "name": "Metadata Filtering Pipeline",
    "embedding_model_id": "f48f032e-2883-4813-be57-d38cf06c5c42",
    "default_text_splitter_id": "019252e9-b4a0-7713-9a69-d701b4f4a2d1",
    "classifications": [
        ["secret", "classified"],
        ["classified", "public"],
    ],
    "schema": schema,
}
res = client.post(f"{FOUNDATION4AI_API}/pipelines", json=pipeline_request)
# note that this will fail if this code is executed twice as the Pipeline `name` must be unique
assert res.status_code == 200

# Show the created pipeline and keep its id for the requests that follow.
created_pipeline = res.json()
print(json.dumps(created_pipeline, indent=4))
pipeline_id = created_pipeline["id"]
{ "id": "4a486865-5369-4075-b406-9ec39df242cf", "name": "Metadata Filtering Pipeline", "description": null, "embedding_model_id": "f48f032e-2883-4813-be57-d38cf06c5c42", "default_text_splitter_id": "019252e9-b4a0-7713-9a69-d701b4f4a2d1", "schema": { "properties": { "department": { "description": "The department to which the user belongs", "type": "string" }, "start_year": { "description": "The year the user started", "type": "integer" }, "username": { "description": "The username of the individual", "type": "string" } }, "type": "object" } }
The Foundation4.ai Dashboard (located at `{FOUNDATION4AI_API}/dashboard`) can also be used to inspect the pipeline object and the associated schema:
Metadata¶
When documents are created, we can optionally specify metadata that will be added to the document. If the metadata does not validate against the pipeline's schema, an error is reported immediately:
# Deliberately send integers where the schema expects strings, to trigger
# the metadata validation error.
invalid_metadata = {"department": 1, "username": 2}
documents_url = f"{FOUNDATION4AI_API}/pipelines/{pipeline_id}/documents"
res = client.post(
    documents_url,
    json={
        "classification": "public",
        "contents": "User full name is John Doe",
        "metadata": invalid_metadata,
    },
)
# No status assertion here: the point is to display the error payload.
print(json.dumps(res.json(), indent=2))
{ "message": "Invalid document metadata", "fields": [ { "department": "1 is not of type \"string\"", "username": "2 is not of type \"string\"" } ], "type": "schema error" }
Fixing the types above will lead to a successfully processed document.
# Same document as before, now with string values that satisfy the schema.
valid_metadata = {"department": "A", "username": "user123"}
documents_url = f"{FOUNDATION4AI_API}/pipelines/{pipeline_id}/documents"
res = client.post(
    documents_url,
    json={
        "classification": "public",
        "contents": "User full name is John Doe",
        "metadata": valid_metadata,
    },
)
assert res.status_code == 200
# Keep the new document's id so its status can be queried afterwards.
document_id = res.json()["id"]
Note that the metadata will also be returned when checking the status of a document:
# Fetch the document record; the response includes the stored metadata.
document_url = f"{FOUNDATION4AI_API}/documents/{document_id}"
res = client.get(document_url)
assert res.status_code == 200
print(json.dumps(res.json(), indent=4))
{ "id": "539fd810-ff38-46f0-b3a5-3f34d20e624a", "external_identifier": null, "pipeline_id": "4a486865-5369-4075-b406-9ec39df242cf", "text_splitter_id": null, "classification": "public", "status": "success", "message": null, "metadata": { "department": "A", "username": "user123" } }
Filtering with Metadata¶
To show how filtering works, let's first create a few more documents with different values for each of the supported tags.
# Additional documents as [contents, department, username] rows.
docs = [
    ["User full name is Jane Doe", "B", "user456"],
    ["User full name is Johnny Doe", "C", "user789"],
]
for contents, dept, user in docs:
    res = client.post(
        f"{FOUNDATION4AI_API}/pipelines/{pipeline_id}/documents",
        json={
            "classification": "public",
            "contents": contents,
            "metadata": {
                "department": dept,
                "username": user,
            },
        },
    )
    # Check every upload, not only the last one.
    assert res.status_code == 200
To filter messages that belong to department `A`, we can use an equality operator on the `department` field:
# Query the agent with an equality filter on the `department` field (value "A").
execute_url = f"{FOUNDATION4AI_API}/agents/{agent_id}/execute"
department_filter = {"department": {"$eq": "A"}}
res = client.post(
    execute_url,
    headers={"X-LLM-ID": llm_id, "X-Pipeline-ID": pipeline_id},
    json={
        "prompt": {"query": "What's the user name?"},
        "classification": "public",
        "filters": department_filter,
    },
)
assert res.status_code == 200
print(res.text)
The user's full name is John Doe.
# Same question, this time with the equality filter set to department "C".
execute_url = f"{FOUNDATION4AI_API}/agents/{agent_id}/execute"
department_filter = {"department": {"$eq": "C"}}
res = client.post(
    execute_url,
    headers={"X-LLM-ID": llm_id, "X-Pipeline-ID": pipeline_id},
    json={
        "prompt": {"query": "What's the user name?"},
        "classification": "public",
        "filters": department_filter,
    },
)
assert res.status_code == 200
print(res.text)
The user name is Johnny Doe.
Without any filtering, all the documents would match and the LLM would simply construct a response from the three documents above:
# Same question with no "filters" key in the request body.
execute_url = f"{FOUNDATION4AI_API}/agents/{agent_id}/execute"
unfiltered_request = {
    "prompt": {"query": "What's the user name?"},
    "classification": "public",
}
res = client.post(
    execute_url,
    headers={"X-LLM-ID": llm_id, "X-Pipeline-ID": pipeline_id},
    json=unfiltered_request,
)
assert res.status_code == 200
print(res.text)
The user name is Jane Doe, John Doe, and Johnny Doe.
Supported filters¶
There are two types of filters available,
- Field filters
- Expression logical filters
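Field filters behave very similarly to the equality `$eq` operator above. They should be specified for each field we intend to filter on.
Supported field operators include `$eq`, `$ge`, and `$lt` (used in the examples on this page), as well as the taxonomy operator `$teq` described in the Taxonomies section.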
Note that there's no limit on the number of filters to apply in one query. For example,
"filters": {
    "department": {"$ge": "A", "$lt": "C"}
}
would match departments `A` and `B` from the documents above.
Additionally, we can also specify logical filter expressions for more advanced queries. Supported operations are:
- `$and`: logical AND operator
- `$or`: logical OR operator
- `$not`: logical NOT operator
For instance, the range filter above could be written in different ways:
"filters": {
"$and": [
{"department": {"$ge": "A"}},
{"department": {"$lt": "C"}}
]
}
or, taking into account that the result should be departments `A` and `B` and there are only 3 departments,
"filters": {
"$or": [
{"department": {"$eq": "A"}},
{"department": {"$eq": "B"}}
]
}
or
"filters": {
"$not": [
{"department": {"$eq": "C"}}
]
}
Taxonomies¶
Taxonomies are sets of tags with defined relationships. For example, we may want to specify where departments are located geographically and later allow queries at the geographical location level.
To do so, let's modify the pipeline schema to allow a new field, `city`:
# Updated metadata schema: the three original fields plus the new `city` field.
schema = {
    "type": "object",
    "properties": {
        "department": {
            "type": "string",
            "description": "The department to which the user belongs",
        },
        "username": {
            "type": "string",
            "description": "The username of the individual",
        },
        "start_year": {
            "type": "integer",
            "description": "The year the user started",
        },
        "city": {
            "type": "string",
            "description": "City where the department is located",
        },
    },
}

# PATCH the existing pipeline, sending only the `schema` attribute.
pipeline_url = f"{FOUNDATION4AI_API}/pipelines/{pipeline_id}"
schema_patch = {"schema": schema}
res = client.patch(pipeline_url, json=schema_patch)
assert res.status_code == 200
We can verify that the PATCH update operation succeeded by inspecting the returned schema for the pipeline.
# Read the pipeline back and display only its schema.
pipeline_url = f"{FOUNDATION4AI_API}/pipelines/{pipeline_id}"
res = client.get(pipeline_url)
current_schema = res.json()["schema"]
print(json.dumps(current_schema, indent=2))
{ "properties": { "city": { "description": "City where the department is located", "type": "string" }, "department": { "description": "The department to which the user belongs", "type": "string" }, "start_year": { "description": "The year the user started", "type": "integer" }, "username": { "description": "The username of the individual", "type": "string" } }, "type": "object" }
Although we could edit every document from each department to also specify the city, it's a lot easier to say that each department is in some city. To do that, we can create a taxonomy that relates cities to departments.
# Create an empty taxonomy; its entries are added in a separate request.
taxonomy_request = {"name": "Metadata Filtering Taxonomy"}
res = client.post(f"{FOUNDATION4AI_API}/taxonomies", json=taxonomy_request)
# note that this will fail if this code is executed twice as the Taxonomy `name` must be unique
assert res.status_code == 200

# Show the created taxonomy and keep its id for the requests that follow.
created_taxonomy = res.json()
print(json.dumps(created_taxonomy, indent=4))
taxonomy_id = created_taxonomy["id"]
{ "id": "012954f9-0f5b-4d11-934a-9c1233560fa8", "name": "Metadata Filtering Taxonomy", "description": null }
# Relate each department (child) to the city (parent) it is located in:
# departments A and B belong to City1, department C to City2.
res = client.post(
    f"{FOUNDATION4AI_API}/taxonomies/{taxonomy_id}/entries",
    json=[
        {
            "parent": "city",
            "parent_value": "City1",
            "child": "department",
            "child_value": "A",
        },
        {
            "parent": "city",
            "parent_value": "City1",
            "child": "department",
            "child_value": "B",
        },
        {
            "parent": "city",
            "parent_value": "City2",
            "child": "department",
            "child_value": "C",
        },
    ],
)
# Stop here if the entries were rejected; otherwise the taxonomy queries
# below would fail later with a less obvious cause.
res.raise_for_status()
Filtering with a taxonomy on `City1` will now yield results for departments `A` and `B`. To do so, use the `$teq` field operator, specifying the `id` of the intended taxonomy to use:
# Taxonomy filter on `city`: `$teq` takes the value to match and the id of
# the taxonomy to use.
execute_url = f"{FOUNDATION4AI_API}/agents/{agent_id}/execute"
city_filter = {"city": {"$teq": ["City1", str(taxonomy_id)]}}
res = client.post(
    execute_url,
    headers={"X-LLM-ID": llm_id, "X-Pipeline-ID": pipeline_id},
    json={
        "prompt": {"query": "What are the user names?"},
        "classification": "public",
        "filters": city_filter,
    },
)
assert res.status_code == 200
print(res.text)
The user names are Jane Doe and John Doe.
Doing the same for `City2` should yield results from department `C`:
# Same taxonomy query, this time matching on "City2".
execute_url = f"{FOUNDATION4AI_API}/agents/{agent_id}/execute"
city_filter = {"city": {"$teq": ["City2", str(taxonomy_id)]}}
res = client.post(
    execute_url,
    headers={"X-LLM-ID": llm_id, "X-Pipeline-ID": pipeline_id},
    json={
        "prompt": {"query": "What are the user names?"},
        "classification": "public",
        "filters": city_filter,
    },
)
assert res.status_code == 200
print(res.text)
The user's full name is Johnny Doe.