Hello @Patrick Martin Castro ,
Refer to this doc for using azure-search-documents with the Azure SDK for Python to apply data chunking and vectorization in an indexer pipeline.
On the storage account, have you assigned the Storage Blob Data Reader role?
On the Azure OpenAI resource, have you added the Cognitive Services OpenAI User role? Have you deployed the sample "gpt-4" model before adding "text-embedding-ada-002", as described in this doc?
You need the Azure OpenAI resource and supported models in place.
Check the supported providers and models here.
Since you already have the models and embedding endpoints within AI services, you can use either the SDK or the REST API.
Below is a sample that creates an indexer on JSON array data via the Python SDK:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexer,
    IndexingParameters,
    IndexingParametersConfiguration,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchIndexerSkillset,
    FieldMapping,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
)
# Azure Search Service Configuration
service_name = "your-search-service-name"
index_name = "index_name"
api_key = "your-azure-search-api-key"
endpoint = f"https://{service_name}.search.windows.net/"
# Azure Blob Storage Configuration
blob_connection_string = "your-blob-connection-string"
container_name = "aicsvdata"
# Azure OpenAI Configuration
azure_openai_service = "your-openai-service-name"
azure_openai_api_key = "your-openai-api-key"
azure_openai_embedding_deployment = "text-embedding-ada-002"
def create_indexer_with_skillset(index_client, indexer_client):
"""Creates an indexer with a skillset to vectorize the 'plot' field."""
data_source_name = "blob-datasource"
indexer_name = "blob-indexer-with-vector"
skillset_name = "plot-vectorization-skillset"
# 1. Data Source
container = SearchIndexerDataContainer(name=container_name)
data_source = SearchIndexerDataSourceConnection(
name=data_source_name,
connection_string=blob_connection_string,
container=container,
type="azureblob",
)
    # 2. Skillset - Embedding Generation
    embedding_skill = AzureOpenAIEmbeddingSkill(
        name="plot-embedding",
        description="Generates vector embeddings for plot field",
        context="/document",
        resource_url=f"https://{azure_openai_service}.openai.azure.com/",
        api_key=azure_openai_api_key,
        deployment_name=azure_openai_embedding_deployment,
        model_name="text-embedding-ada-002",
        inputs=[InputFieldMappingEntry(name="text", source="/document/plot")],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="PlotVector")],
    )
    skillset = SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset to vectorize plot",
        skills=[embedding_skill],
    )
    # 3. Field Mappings
    field_mappings = [
        FieldMapping(source_field_name="/document/PlotVector", target_field_name="PlotVector"),
    ]
    indexing_parameters = IndexingParameters(
        configuration=IndexingParametersConfiguration(
            parsing_mode="jsonArray",  # Options: 'default', 'delimitedText', 'json', 'jsonArray', 'jsonLines', 'text', 'markdown'
            data_to_extract="contentAndMetadata",  # Options: 'allMetadata', 'contentAndMetadata', 'storageMetadata'
            query_timeout=None,  # queryTimeout applies only to SQL data sources; clear the default for blob indexers
        )
    )
    # 4. Indexer
    indexer = SearchIndexer(
        name=indexer_name,
        data_source_name=data_source_name,
        target_index_name=index_name,
        skillset_name=skillset_name,
        parameters=indexing_parameters,
        output_field_mappings=field_mappings,
    )
    # Create skillset and data source
    indexer_client.create_or_update_skillset(skillset)
    indexer_client.create_or_update_data_source_connection(data_source)
    # Create and run the indexer
    result = indexer_client.create_or_update_indexer(indexer)
    print(f"Indexer '{result.name}' created.")
    indexer_client.run_indexer(indexer_name)
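    # Optionally confirm the run started; get_indexer_status returns the current
    # state and the last execution result for the indexer
    status = indexer_client.get_indexer_status(indexer_name)
    print(f"Indexer status: {status.status}")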
def main():
    try:
        credential = AzureKeyCredential(api_key)
        index_client = SearchIndexClient(endpoint, credential)
        indexer_client = SearchIndexerClient(endpoint, credential)
        create_indexer_with_skillset(index_client, indexer_client)
    except Exception as ex:
        print(f"An error occurred: {ex}")
if __name__ == "__main__":
    main()
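The indexer above writes into an existing index that must already have a vector field named "PlotVector". If you still need to create that index, here is a minimal sketch you could add to the same script; the "id" and "plot" field names and the 1536 dimensions (for text-embedding-ada-002) are assumptions, so adjust them to your JSON documents and embedding model:
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchableField,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)
def create_index(index_client):
    """Creates the target index with a vector field for the plot embeddings (sketch)."""
    fields = [
        # "id" is assumed to be the document key present in your JSON array items
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        # "plot" is the text field the skillset reads from
        SearchableField(name="plot", type=SearchFieldDataType.String),
        SearchField(
            name="PlotVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,  # output size of text-embedding-ada-002
            vector_search_profile_name="plot-vector-profile",
        ),
    ]
    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="hnsw-config")],
        profiles=[
            VectorSearchProfile(
                name="plot-vector-profile",
                algorithm_configuration_name="hnsw-config",
            )
        ],
    )
    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    index_client.create_or_update_index(index)
    print(f"Index '{index_name}' created.")
If you go this route, call create_index(index_client) in main() before create_indexer_with_skillset so the indexer has a target index to write to.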