Audio RAG: retrieval-augmented generation over podcast audio with Indexify
In [ ]:
Copied!
pip install accelerate ffmpeg indexify
pip install accelerate ffmpeg indexify
In [5]:
Copied!
from indexify import IndexifyClient
# Create a client for the Indexify service (presumably the local default
# endpoint — confirm the server is running before executing later cells).
client = IndexifyClient()
from indexify import IndexifyClient
client = IndexifyClient()
In [ ]:
Copied!
import urllib.request

# Signed CDN link to the podcast episode. NOTE(review): the query string
# carries `Expires`/`Signature` params, so this URL is time-limited and the
# download will fail once the signature lapses — refresh the link if needed.
episode_url = "https://content.libsyn.com/p/5/d/f/5df17f8350f43745/ALLIN-E167.mp3?c_id=168165938&cs_id=168165938&destination_id=1928300&response-content-type=audio%2Fmpeg&Expires=1708908562&Signature=aLXFc-TXV3rb53Kg4ntr05zuHq3b5~6c03CPK4HNfhAwwbHbXf59Wan3NG2p0AzOfHWSqmK4A7sEarPdvi89hBMljCOpL~8HvX8aoGSmktgqkghQ4TDl~Q89DZ1gkuFHQ6BlIK5qAcdMA0NyfDhF5JN6UjI1Ja~OHuB6LjP-lZojmQddvfSdiI4oOJuODxQjffwZS1AaBO0z4yB4EcIqzwe46UWEe4ajL3mpgek9rb3ByyYarrewKq7-ZzpaKLFY6SN7cN2KMOPdyMvGw0rzNM9NfApLhyyEwDmDVOIhmoiqu2Act8U-1Oqcu23Hdm0MziL9ILPmRomu0XBfe9au1w__&Key-Pair-Id=K1YS7LZGUP96OI"
# Save the episode next to the notebook so `upload_file` below can find it.
urllib.request.urlretrieve(url=episode_url, filename="ALLIN-E167.mp3")
import urllib.request
episode_url = "https://content.libsyn.com/p/5/d/f/5df17f8350f43745/ALLIN-E167.mp3?c_id=168165938&cs_id=168165938&destination_id=1928300&response-content-type=audio%2Fmpeg&Expires=1708908562&Signature=aLXFc-TXV3rb53Kg4ntr05zuHq3b5~6c03CPK4HNfhAwwbHbXf59Wan3NG2p0AzOfHWSqmK4A7sEarPdvi89hBMljCOpL~8HvX8aoGSmktgqkghQ4TDl~Q89DZ1gkuFHQ6BlIK5qAcdMA0NyfDhF5JN6UjI1Ja~OHuB6LjP-lZojmQddvfSdiI4oOJuODxQjffwZS1AaBO0z4yB4EcIqzwe46UWEe4ajL3mpgek9rb3ByyYarrewKq7-ZzpaKLFY6SN7cN2KMOPdyMvGw0rzNM9NfApLhyyEwDmDVOIhmoiqu2Act8U-1Oqcu23Hdm0MziL9ILPmRomu0XBfe9au1w__&Key-Pair-Id=K1YS7LZGUP96OI"
urllib.request.urlretrieve(url=episode_url, filename="ALLIN-E167.mp3")
In [8]:
Copied!
# Upload the downloaded episode to Indexify; the extraction policies defined
# below will then run against it.
client.upload_file(path="ALLIN-E167.mp3")
client.upload_file(path="ALLIN-E167.mp3")
In [9]:
Copied!
# Extraction policy: run the Whisper ASR extractor on uploaded audio to
# produce a text transcription named "audio-transcription".
client.add_extraction_policy(extractor='tensorlake/whisper-asr', name="audio-transcription")
client.add_extraction_policy(extractor='tensorlake/whisper-asr', name="audio-transcription")
In [10]:
Copied!
# Chain an embedding policy off the transcription: `content_source` wires this
# policy to the output of the "audio-transcription" policy above, embedding
# the transcript with MiniLM-L6 (default chunking).
client.add_extraction_policy(extractor='tensorlake/minilm-l6', name="transcription-embedding", content_source='audio-transcription')
client.add_extraction_policy(extractor='tensorlake/minilm-l6', name="transcription-embedding", content_source='audio-transcription')
In [15]:
Copied!
# Same MiniLM-L6 embedding, but with explicit chunking: 200-character chunks
# with a 50-character overlap, stored under a separate index name so it can
# be compared against the default-chunked index above.
client.add_extraction_policy(extractor='tensorlake/minilm-l6', name="transcription-embedding1", content_source='audio-transcription', input_params={'chunk_size': 200, 'overlap': 50})
client.add_extraction_policy(extractor='tensorlake/minilm-l6', name="transcription-embedding1", content_source='audio-transcription', input_params={'chunk_size': 200, 'overlap': 50})
In [21]:
Copied!
# Third embedding variant: OpenAI ada-002 embeddings over larger chunks
# (1000 characters, 200 overlap). This "transcription-embedding2" index is
# the one the retriever queries later in the notebook.
client.add_extraction_policy(extractor='openai-embedding-ada-002-extractor', name="transcription-embedding2", content_source='audio-transcription', input_params={'chunk_size': 1000, 'overlap': 200})
client.add_extraction_policy(extractor='openai-embedding-ada-002-extractor', name="transcription-embedding2", content_source='audio-transcription', input_params={'chunk_size': 1000, 'overlap': 200})
In [3]:
Copied!
from indexify_langchain import IndexifyRetriever
from indexify_langchain import IndexifyRetriever
In [22]:
Copied!
# Query the ada-002 index built by the "transcription-embedding2" policy,
# retrieving the 50 closest transcript chunks for each question.
params = {"name": "transcription-embedding2.embedding", "top_k": 50}
retriever = IndexifyRetriever(client=client, params=params)
params = {"name": "transcription-embedding2.embedding", "top_k": 50}
retriever = IndexifyRetriever(client=client, params=params)
In [23]:
Copied!
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
In [24]:
Copied!
# Build the RAG chain: the retriever fills {context} with transcript chunks,
# the question passes through unchanged, and the chat model answers strictly
# from the retrieved context.
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Never hardcode a credential in a notebook — a pasted key leaks the moment
# the file is shared or committed. With no argument, ChatOpenAI reads the
# key from the OPENAI_API_KEY environment variable.
model = ChatOpenAI()
chain = (
{"context": retriever, "question": RunnablePassthrough()}
| prompt
| model
| StrOutputParser()
)
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI()
chain = (
{"context": retriever, "question": RunnablePassthrough()}
| prompt
| model
| StrOutputParser()
)
In [25]:
Copied!
# Run the full pipeline end-to-end with a sample question; the answer is
# generated only from the retrieved podcast transcript chunks.
chain.invoke("Tell me about Grok")
chain.invoke("Tell me about Grok")
Out[25]:
'Grok is a company that has had a significant viral moment in its history recently. It was founded in 2016 and has been a long road for the company. The company has seen a surge in customers and interest, with 3,000 unique customers trying to consume their resources in a short period, ranging from Fortune 500 companies to developers. The company has been fortunate to experience this growth and potential disruption in the market. Time will tell how big the company can get, but there is a lot of market cap for Grok to gain by producing things at scale. The company has been described as a meager unicorn, with a last valuation of around a billion dollars. The potential for Grok to be disruptive in the market is significant, and it has had a very exciting and important moment in its history recently.'
In [ ]:
Copied!