High severity · NVD Advisory · Published Sep 5, 2024 · Updated Sep 5, 2024
MindsDB Vulnerable to Bypass of SSRF Protection with DNS Rebinding
CVE-2024-24759
Description
MindsDB is a platform for building artificial intelligence from enterprise data. Prior to version 23.12.4.2, a threat actor can bypass the server-side request forgery protection on the whole website with DNS Rebinding. The vulnerability can also lead to denial of service. Version 23.12.4.2 contains a patch.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| mindsdb (PyPI) | < 23.12.4.2 | 23.12.4.2 |
Affected products
Patches (1)
1. 5f7496481bd3 — Release v23.12.4.2 (#8573)
25 files changed · +939 −357
mindsdb/__about__.py+1 −1 modified@@ -1,6 +1,6 @@ __title__ = 'MindsDB' __package_name__ = 'mindsdb' -__version__ = '23.12.4.1' +__version__ = '23.12.4.2' __description__ = "MindsDB server, provides server capabilities to mindsdb native python library" __email__ = "jorge@mindsdb.com" __author__ = 'MindsDB Inc'
mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py+44 −14 modified@@ -1,6 +1,6 @@ import os -import json import openai +import json import contextlib from typing import Optional, Dict @@ -32,7 +32,6 @@ def __init__(self, *args, **kwargs): self.rate_limit = 25 # requests per minute self.max_batch_size = 20 self.default_max_tokens = 100 - self.ft_cls = openai.FineTuningJob # non-legacy fine-tuning endpoint @staticmethod @contextlib.contextmanager @@ -47,24 +46,25 @@ def _anyscale_base_api(key='OPENAI_API_BASE'): def create(self, target, args=None, **kwargs): with self._anyscale_base_api(): + # load base and fine-tuned models, then hand over self._set_models(args.get('using', {})) - - # load fine-tuned models, then hand over _args = self.model_storage.json_get('args') base_models = self.chat_completion_models self.chat_completion_models = _args.get('chat_completion_models', base_models) if _args else base_models super().create(target, args, **kwargs) def predict(self, df: pd.DataFrame, args: Optional[Dict] = None) -> pd.DataFrame: with self._anyscale_base_api(): - # load fine-tuned models, then hand over + # load base and fine-tuned models, then hand over + self._set_models(args.get('using', {})) _args = self.model_storage.json_get('args') base_models = self.chat_completion_models self.chat_completion_models = _args.get('chat_completion_models', base_models) if _args else base_models return super().predict(df, args) def finetune(self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> None: with self._anyscale_base_api(): + self._set_models(args.get('using', {})) super().finetune(df, args) # rewrite chat_completion_models to include the newly fine-tuned model args = self.model_storage.json_get('args') @@ -90,11 +90,10 @@ def describe(self, attribute: Optional[str] = None) -> pd.DataFrame: def _set_models(self, args): if 'api_key' in args: args['openai_api_key'] = args['api_key'] # remove this once #7496 is fixed - 
self.all_models = [m['id'] for m in openai.Model.list( - api_key=get_api_key('openai', args, self.engine_storage), - api_base=ANYSCALE_API_BASE)['data']] - self.chat_completion_models = self.all_models - self.supported_ft_models = self.all_models # base models compatible with fine-tuning + client = self._get_client(get_api_key('openai', args, self.engine_storage)) + self.all_models = [m.id for m in client.models.list()] + self.chat_completion_models = [m.id for m in client.models.list() if m.rayllm_metadata['engine_config']['model_type'] == 'text-generation'] # noqa + self.supported_ft_models = self.chat_completion_models # base models compatible with fine-tuning @staticmethod def _check_ft_cols(df, cols): @@ -107,19 +106,46 @@ def _prepare_ft_jsonl(self, df, temp_storage_path, temp_filename, _, test_size=0 df: has exactly two columns, `role` and `content`. Rows contain >= 1 chats in long (stacked) format. For more details, check `FineTuning -> Data Format` in the Anyscale API reference. """ + def _is_valid(chat): + """ Check if chat is valid according to Anyscale criteria.""" + roles = [m['role'] for m in chat] + transitions = {None: ['system', 'user'], 'system': ['user'], 'user': ['assistant'], 'assistant': ['user']} + + # check base condition + if not ('user' in roles and 'assistant' in roles): + return False + + # check order is valid + state = None + for role in roles: + if role not in transitions[state]: + return False + else: + state = role + + # chat is valid, return + return True + # 1. 
aggregate each chat sequence into one row chats = [] chat = [] for i, row in df.iterrows(): if row['role'] == 'system' and len(chat) > 0: - chats.append({'messages': chat}) + if _is_valid(chat): + chats.append({'messages': chat}) chat = [] event = {'role': row['role'], 'content': row['content']} chat.append(event) - chats.append({'messages': chat}) + + if _is_valid(chat): + chats.append({'messages': chat}) + series = pd.Series(chats) - train = series.iloc[:int(len(series) * (1 - test_size))] - val = series.iloc[-int(len(series) * test_size) - 1:] + if len(series) < 20 * 2: + raise Exception("Dataset is too small to finetune. Please include at least 40 samples (complete chats).") + val_size = max(20, int(len(series) * test_size)) # at least 20 samples required by Anyscale + train = series.iloc[:-val_size] + val = series.iloc[-val_size:] # 2. write as jsonl file_names = { @@ -190,3 +216,7 @@ def check_data_for_format_errors(items: list): check_data_for_format_errors(items) except Exception as e: raise Exception(f"Fine-tuning data format is not valid. Got: {e}") + + @staticmethod + def _get_client(api_key, base_url=ANYSCALE_API_BASE, org=None): + return openai.OpenAI(api_key=api_key, base_url=base_url, organization=org)
mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt+1 −1 modified@@ -1,2 +1,2 @@ -openai == 0.28.1 +openai == 1.6.1 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
mindsdb/integrations/handlers/instatus_handler/instatus_handler.py+5 −9 modified@@ -1,4 +1,4 @@ -from mindsdb.integrations.handlers.instatus_handler.instatus_tables import StatusPages +from mindsdb.integrations.handlers.instatus_handler.instatus_tables import StatusPages, Components from mindsdb.integrations.libs.api_handler import APIHandler from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse from mindsdb.utilities import log @@ -31,6 +31,7 @@ def __init__(self, name: str, **kwargs) -> None: _tables = [ StatusPages, + Components ] for Table in _tables: @@ -91,7 +92,7 @@ def native_query(self, query: str) -> StatusResponse: ast = parse_sql(query, dialect="mindsdb") return self.query(ast) - def call_instatus_api(self, endpoint: str, method: str = 'GET', params: dict = None, data=None) -> pd.DataFrame: + def call_instatus_api(self, endpoint: str, method: str = 'GET', params: dict = None, json_data: dict = {}) -> pd.DataFrame: if not params: params = {} @@ -101,16 +102,11 @@ def call_instatus_api(self, endpoint: str, method: str = 'GET', params: dict = N if method.upper() in ('GET', 'POST', 'PUT', 'DELETE'): headers['Content-Type'] = 'application/json' - if method.upper() in ('POST', 'PUT', 'DELETE'): - response = requests.request(method, url, headers=headers, params=params, data=data) - else: - response = requests.get(url, headers=headers, params=params) + response = requests.request(method, url, headers=headers, params=params, json=json_data) if response.status_code == 200: data = response.json() - return pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame({ - 'data': data - }) + return pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame([data]) else: raise Exception(f"Error connecting to Instatus API: {response.status_code} - {response.text}")
mindsdb/integrations/handlers/instatus_handler/instatus_tables.py+231 −7 modified@@ -5,6 +5,11 @@ from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions from mindsdb_sql.parser.ast.select.constant import Constant import json +import re + +langCodes = ["ar", "cs", "da", "de", "en", "es", "et", "fi", "fr", "hu", "id", "it", "ja", "ko", + "nl", "no", "pl", "pt", "pt-BR", "ro", "rs", "ru", "sl", "sq", "sv", "tr", "uk", + "vi", "zh", "zh-TW"] class StatusPages(APITable): @@ -87,11 +92,16 @@ def insert(self, query: ast.Insert) -> None: """ data = {} for column, value in zip(query.columns, query.values[0]): - if isinstance(value, Constant): - data[column.name] = value.value - else: - data[column.name] = value - self.handler.call_instatus_api(endpoint='/v1/pages', method='POST', data=json.dumps(data)) + if isinstance(value, str): + try: + value = json.loads(value) + except json.JSONDecodeError: + if value == 'True': + value = True + elif value == 'False': + value = False + data[column.name] = value + self.handler.call_instatus_api(endpoint='/v1/pages', method='POST', json_data=data) def update(self, query: ast.Update) -> None: """Receive query as AST (abstract syntax tree) and act upon it somehow. 
@@ -114,10 +124,14 @@ def update(self, query: ast.Update) -> None: for key, value in query.update_columns.items(): if isinstance(value, Constant): if key == 'components': - data[key] = json.loads(value.value) # Convert 'components' value to a Python list + data[key] = json.loads(value.value) else: data[key] = value.value - self.handler.call_instatus_api(endpoint=f'/v2/{_id}', method='PUT', data=json.dumps(data)) + + if 'components' in data and isinstance(data['components'], str): + data['components'] = json.loads(data['components']) + + self.handler.call_instatus_api(endpoint=f'/v2/{_id}', method='PUT', json_data=data) def get_columns(self, ignore: List[str] = []) -> List[str]: """columns @@ -167,3 +181,213 @@ def get_columns(self, ignore: List[str] = []) -> List[str]: "createdAt", "updatedAt" ] + + +class Components(APITable): + + # table name in the database + name = 'components' + + def select(self, query: ast.Select) -> pd.DataFrame: + """Receive query as AST (abstract syntax tree) and act upon it. + + Args: + query (ASTNode): SQL query represented as AST. 
Usually it should be ast.Select + + Returns: + pd.DataFrame + """ + conditions = extract_comparison_conditions(query.where) + + if len(conditions) == 0: + raise Exception('WHERE clause is required') + + # Get page id and component id from query + pageId = None + componentId = None + for condition in conditions: + if condition[1] == 'page_id' and condition[0] == '=': + pageId = condition[2] + + if condition[1] == 'component_id' and condition[0] == '=': + componentId = condition[2] + + # Get column names from query + selected_columns = [] + for target in query.targets: + if isinstance(target, ast.Star): + selected_columns = self.get_columns() + break + elif isinstance(target, ast.Identifier): + selected_columns.append(target.parts[-1]) + else: + raise ValueError(f"Unknown query target {type(target)}") + + limit = query.limit.value if query.limit else None + if componentId: + # Call instatus API and get the response as pd.DataFrame + df = self.handler.call_instatus_api(endpoint=f'/v1/{pageId}/components/{componentId}') + for langCode in langCodes: + try: + df[f"translations_name_in_{langCode}"] = df["translations"].apply(lambda x: x.get("name", None)).apply(lambda x: x.get(langCode, None)) + df[f"translations_desc_in_{langCode}"] = df["translations"].apply(lambda x: x.get("description", None)).apply(lambda x: x.get(langCode, None)) + except AttributeError: + df[f"translations_name_in_{langCode}"] = None + df[f"translations_desc_in_{langCode}"] = None + df = df.drop(columns=["translations"]) + + result_df = df[selected_columns] + else: + # Call instatus API and get the response as pd.DataFrame + page_size = 100 + # Calculate the number of pages required + page_count = (limit + page_size - 1) // page_size if limit else 1 + result_df = pd.DataFrame(columns=selected_columns) + + # Call instatus API and get the response as pd.DataFrame for each page + for page in range(1, page_count + 1): + current_page_size = min(page_size, limit) if limit else page_size + + df = 
self.handler.call_instatus_api(endpoint=f'/v1/{pageId}/components', params={'page': page, 'per_page': current_page_size}) + # Break if no more data is available or limit is reached + if len(df) == 0 or (limit and limit <= 0) or limit == 0: + break + ''' Add translations_name_in_{langCode} and translations_desc_in_{langCode} columns to the dataframe''' + for i in range(len(df)): + for langCode in langCodes: + try: + df.at[i, f"translations_name_in_{langCode}"] = df.at[i, "translations"].get("name", {}).get(langCode, None) + df.at[i, f"translations_desc_in_{langCode}"] = df.at[i, "translations"].get("description", {}).get(langCode, None) + except AttributeError: + df.at[i, f"translations_name_in_{langCode}"] = None + df.at[i, f"translations_desc_in_{langCode}"] = None + + # Drop the 'translations' column + df = df.drop(columns=["translations"]) + # Concatenate the dataframes + result_df = pd.concat([result_df, df[selected_columns]], ignore_index=True) + + if limit: + limit -= len(df) + + return result_df + + def insert(self, query: ast.Insert) -> None: + """Receive query as AST (abstract syntax tree) and act upon it somehow. + + Args: + query (ASTNode): sql query represented as AST. 
Usually it should be ast.Insert + + Returns: + None + """ + data = {'translations': { + "name": {}, + "description": {} + }} + + for column, value in zip(query.columns, query.values[0]): + if isinstance(value, Constant): + data[column.name] = json.loads(value.value) if column.name == 'translations' else value.value + elif isinstance(value, str): + try: + if re.match(r'^translations_name_in_[a-zA-Z\-]+$', column.name): + lang_code = column.name.split('_')[-1] + if lang_code not in langCodes: + raise Exception(f'Invalid language code {lang_code}') + data['translations']['name'][lang_code] = value + elif re.match(r'^translations_desc_in_[a-zA-Z\-]+$', column.name): + lang_code = column.name.split('_')[-1] + if lang_code not in langCodes: + raise Exception(f'Invalid language code {lang_code}') + data['translations']['description'][lang_code] = value + else: + data[column.name] = json.loads(value) + except json.JSONDecodeError: + data[column.name] = True if value == 'True' else (False if value == 'False' else value) + + page_id = data.pop('page_id', None) + + if page_id is not None: + self.handler.call_instatus_api(endpoint=f'/v1/{page_id}/components', method='POST', json_data=data) + + def update(self, query: ast.Update) -> None: + """Receive query as AST (abstract syntax tree) and act upon it somehow. + + Args: + query (ASTNode): sql query represented as AST. 
Usually it should be ast.Update + Returns: + None + """ + conditions = extract_comparison_conditions(query.where) + # Get page id and component id from query + pageId = None + componentId = None + for condition in conditions: + if condition[1] == 'page_id' and condition[0] == '=': + pageId = condition[2] + elif condition[1] == 'component_id' and condition[0] == '=': + componentId = condition[2] + else: + raise Exception("page_id and component_id both are required") + + data = {'translations': { + "name": {}, + "description": {} + }} + for key, value in query.update_columns.items(): + if isinstance(value, Constant): + if re.match(r'^translations_name_in_[a-zA-Z\-]+$', key): + lang_code = key.split('_')[-1] + if lang_code not in langCodes: + raise Exception(f'Invalid language code {lang_code}') + data['translations']['name'][lang_code] = value.value + elif re.match(r'^translations_desc_in_[a-zA-Z\-]+$', key): + lang_code = key.split('_')[-1] + if lang_code not in langCodes: + raise Exception(f'Invalid language code {lang_code}') + data['translations']['description'][lang_code] = value.value + else: + data[key] = value.value + self.handler.call_instatus_api(endpoint=f'/v1/{pageId}/components/{componentId}', method='PUT', json_data=data) + + def get_columns(self, ignore: List[str] = []) -> List[str]: + """columns + + Args: + ignore (List[str], optional): exclusion items. Defaults to []. + + Returns: + List[str]: available columns with `ignore` items removed from the list. 
+ """ + return [ + "id", + "name", + "nameTranslationId", + "description", + "descriptionTranslationId", + "status", + "order", + "showUptime", + "createdAt", + "updatedAt", + "archivedAt", + "siteId", + "uniqueEmail", + "oldGroup", + "groupId", + "isParent", + "isCollapsed", + "monitorId", + "nameHtml", + "nameHtmlTranslationId", + "descriptionHtml", + "descriptionHtmlTranslationId", + "isThirdParty", + "thirdPartyStatus", + "thirdPartyComponentId", + "thirdPartyComponentServiceId", + "importedFromStatuspage", + "startDate", + "group", + ] + [f'translations_name_in_{langCode}' for langCode in langCodes] + [f'translations_desc_in_{langCode}' for langCode in langCodes]
mindsdb/integrations/handlers/instatus_handler/README.md+101 −41 modified@@ -17,11 +17,49 @@ Please follow this [link](https://dashboard.instatus.com/developer) to get the a ## Implemented Features - [x] Instatus status pages table - - [x] Get status pages - - [x] Create a status page - - [x] Update a status page - -## Example Usage + - [x] Support SELECT + - [x] Support INSERT + - [x] Support UPDATE +- [x] Instatus components table + - [x] Support SELECT + - [x] Support INSERT + - [x] Support UPDATE + +## TODO +- [ ] Instatus Incidents table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Incidents updates table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Maintenances table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Maintenance updates table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Templates table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Teammates table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Subscribers table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE +- [ ] Instatus Metrics table + - [ ] Support SELECT + - [ ] Support INSERT + - [ ] Support UPDATE + +## Connection The first step is to create a database with the new `instatus` engine. @@ -33,47 +71,24 @@ WITH "api_key": "<your-instatus-api-key>" --- Instatus API key to use for authentication. 
}; ``` +## Usage (Status pages table) -### Get your status pages - -Example 1: Select all columns +### SELECT ```sql SELECT * FROM mindsdb_instatus.status_pages; ``` -Example 2: Select specific columns - -```sql -SELECT id, name, status -FROM mindsdb_instatus.status_pages; -``` - -Example 3: Get specific status page +### WHERE ```sql SELECT * FROM mindsdb_instatus.status_pages WHERE id = '<status-page-id>'; ``` -Example 4: Apply limit - -```sql -SELECT * -FROM mindsdb_instatus.status_pages -LIMIT 10; -``` - -### Create a status page - -```sql -INSERT INTO mindsdb_instatus.status_pages (column1, column2, column3, ...) -VALUES (value1, value2, value3, ...); -``` - -Example: +### INSERT ```sql INSERT INTO mindsdb_instatus.status_pages (email, name, subdomain, components, logoUrl, faviconUrl, websiteUrl, language, useLargeHeader, brandColor, okColor, disruptedColor, degradedColor, downColor, noticeColor, unknownColor, googleAnalytics, subscribeBySms, smsService, twilioSid, twilioToken, twilioSender, nexmoKey, nexmoSecret, nexmoSender, htmlInMeta, htmlAboveHeader, htmlBelowHeader, htmlAboveFooter, htmlBelowFooter, htmlBelowSummary, cssGlobal, launchDate, dateFormat, dateFormatShort, timeFormat) @@ -88,15 +103,7 @@ Note: - `components` is required field (Example: '["Website", "App", "API"]') - other fields are optional -### Update a status page - -```sql -UPDATE mindsdb_instatus.status_pages -SET column1 = value1, column2 = value2, ... 
-WHERE id = '<status-page-id>'; -``` - -Example: +### UPDATE ```sql UPDATE mindsdb_instatus.status_pages @@ -145,3 +152,56 @@ SET name = 'mindsdb', }' WHERE id = '<status-page-id>'; ``` + +## Usage (Components table) + +### SELECT + +```sql +SELECT * +FROM mindsdb_instatus.components +WHERE page_id = '<status-page-id>'; +``` + +### WHERE + +```sql +SELECT * +FROM mindsdb_instatus.components +WHERE page_id = '<status-page-id>' +AND component_id = '<component-id>'; +``` + +### CREATE + +```sql +INSERT INTO mindsdb_instatus.components (page_id, name, description, status, order, showUptime, grouped, translations_name_in_fr, translations_desc_in_fr) +VALUES ( + '<page-id>', + 'Test component', + 'Testing', + 'OPERATIONAL', + 6, + true, + false, + "Composant de test", + "En test" +); +``` + +### UPDATE + +```sql +UPDATE mindsdb_instatus.components +SET + name = 'Test component 4', + description = 'Test test test', + status = 'OPERATIONAL', + order = 6, + showUptime = true, + grouped = false, + translations_name_in_fr = "Composant de test 4", + translations_desc_in_fr = "Test test test" +WHERE page_id = '<status-page-id>' +AND component_id = '<component-id>'; +```
mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt+1 −1 modified@@ -1,2 +1,2 @@ -openai == 0.28.1 +openai == 1.6.1 tiktoken~=0.4.0
mindsdb/integrations/handlers/langchain_handler/langchain_handler.py+3 −2 modified@@ -184,6 +184,7 @@ def _get_chat_model_params(self, args, pred_args): model_kwargs['best_of'] = pred_args.get('best_of', None) model_kwargs['logit_bias'] = pred_args.get('logit_bias', None) model_kwargs['openai_api_key'] = get_api_key('openai', args, self.engine_storage) + model_kwargs['openai_organization'] = args.get('api_organization', None) model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None} # filter out None values return model_kwargs @@ -418,9 +419,9 @@ def finetune(self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = Non def sql_agent_completion(self, df, args=None, pred_args=None): """This completion will be used to answer based on information passed by any MindsDB DB or API engine.""" db = MindsDBSQL(engine=args['executor'], metadata=args['executor'].session.integration_controller) - toolkit = SQLDatabaseToolkit(db=db) model_name = args.get('model_name', self.default_model) - llm = OpenAI(temperature=0) if model_name not in OPEN_AI_CHAT_MODELS else ChatOpenAI(temperature=0) + llm = OpenAI(temperature=0) if model_name not in OPEN_AI_CHAT_MODELS else self._create_chat_model(args, pred_args) # noqa + toolkit = SQLDatabaseToolkit(db=db, llm=llm) agent = create_sql_agent( llm=llm, toolkit=toolkit,
mindsdb/integrations/handlers/langchain_handler/requirements.txt+1 −1 modified@@ -1,4 +1,4 @@ -openai == 0.28.1 +openai == 1.6.1 wikipedia==1.4.0 tiktoken >= 0.3.0 anthropic==0.3.5
mindsdb/integrations/handlers/llama_index_handler/__about__.py+8 −8 modified@@ -1,9 +1,9 @@ -__title__ = 'MindsDB LlamaIndex handler' -__package_name__ = 'mindsdb_llama_index_handler' -__version__ = '0.0.1' +__title__ = "MindsDB LlamaIndex handler" +__package_name__ = "mindsdb_llama_index_handler" +__version__ = "0.0.1" __description__ = "MindsDB handler for LlamaIndex" -__author__ = 'Balaji Seetharaman ' -__github__ = 'https://github.com/mindsdb/mindsdb' -__pypi__ = 'https://pypi.org/project/mindsdb/' -__license__ = 'MIT' -__copyright__ = 'Copyright 2023 - mindsdb' +__author__ = "Balaji Seetharaman " +__github__ = "https://github.com/mindsdb/mindsdb" +__pypi__ = "https://pypi.org/project/mindsdb/" +__license__ = "MIT" +__copyright__ = "Copyright 2023 - mindsdb"
mindsdb/integrations/handlers/llama_index_handler/config.py+18 −0 added@@ -0,0 +1,18 @@ +# this dict is used to configure the data loader and match one on one arguments passed by the user in args +data_loaders = { + "DFReader": { + # TODO: add parameter of DFReader + }, + "SimpleWebPageReader": { + "source_url_link": "<url>", + }, + "GithubRepositoryReader": { + "owner": "<owner>", + "repo": "<repository>", + "github_token": "<github_token>", + "branch": "<branch>", + }, + "YoutubeTranscriptReader": { + "ytlinks": ["<youtube_link>"], + }, +}
mindsdb/integrations/handlers/llama_index_handler/github_loader_helper.py+65 −0 added@@ -0,0 +1,65 @@ +import os +from llama_hub.github_repo import GithubRepositoryReader + + +def _get_github_token(args, connection_args): + """ + API_KEY preference order: + 1. provided at model creation + 2. provided at engine creation + 3. GITHUB_TOKEN env variable + + Note: method is not case-sensitive. + """ + key = "GITHUB_TOKEN" + for k in key, key.lower(): + if args.get(k): + return args[k] + + connection_args = connection_args + if connection_args.get(k): + return connection_args[k] + + api_key = os.getenv(k) + if os.environ.get(k): + return api_key + + return None + + +def _get_filter_file_extensions(args): + """ + Returns file extensions to filter, if Filter type is EXCLUDE the file extensions will be excluded + from the knowledge source, if Filter type is INCLUDE the file extensions will be included in the + knowledge source. + """ + # filter_file_extensions is not provided + if "filter_file_extensions" not in args: + return None + + # if filter_type is provided with EXCLUDE or INCLUDE + if args["filter_file_extensions"] and args["filter_type"].upper() == "INCLUDE": + filter_file_extensions = args["filter_file_extensions"] + return (filter_file_extensions, GithubRepositoryReader.FilterType.INCLUDE) + else: + filter_file_extensions = args["filter_file_extensions"] + return (filter_file_extensions, GithubRepositoryReader.FilterType.EXCLUDE) + + +def _get_filter_directories(args): + """ + Returns directories to filter, if Filter type is EXCLUDE the directories will be excluded + from the knowledge source, if Filter type is INCLUDE the directories will be included in the + knowledge source. 
+ """ + # filter_directories is not provided + if "filter_directories" not in args: + return None + + # if filter_type is provided with EXCLUDE or INCLUDE + if args["filter_directories"] and args["filter_type"].upper() == "INCLUDE": + filter_directories = args["filter_directories"] + return (filter_directories, GithubRepositoryReader.FilterType.INCLUDE) + else: + filter_directories = args["filter_directories"] + return (filter_directories, GithubRepositoryReader.FilterType.EXCLUDE)
mindsdb/integrations/handlers/llama_index_handler/__init__.py+6 −6 modified@@ -1,18 +1,18 @@ from mindsdb.integrations.libs.const import HANDLER_TYPE from .__about__ import __version__ as version, __description__ as description + try: - from .llama_index_handler import LlamaIndexHandler as Handler + from .llama_index_handler import LlamaIndexHandler as Handler + import_error = None except Exception as e: Handler = None import_error = e -title = 'LlamaIndex' -name = 'llama_index' +title = "LlamaIndex" +name = "llama_index" type = HANDLER_TYPE.ML permanent = True -__all__ = [ - 'Handler', 'version', 'name', 'type', 'title', 'description', 'import_error' -] +__all__ = ["Handler", "version", "name", "type", "title", "description", "import_error"]
mindsdb/integrations/handlers/llama_index_handler/llama_index_handler.py+211 −136 modified@@ -3,229 +3,304 @@ import openai import pandas as pd -from langchain.llms import OpenAI import llama_index + +from langchain.llms import OpenAI from llama_index.readers.schema.base import Document -from llama_index import SimpleWebPageReader, QuestionAnswerPrompt +from llama_index.readers import SimpleWebPageReader +from llama_index.prompts import PromptTemplate from llama_index import ServiceContext, StorageContext, load_index_from_storage from llama_index import LLMPredictor, OpenAIEmbedding from llama_index.indices.vector_store.base import VectorStore +from llama_hub.github_repo import GithubClient, GithubRepositoryReader +from llama_hub.youtube_transcript import YoutubeTranscriptReader, is_youtube_video + from mindsdb.integrations.libs.base import BaseMLEngine from mindsdb.utilities.config import Config from mindsdb.utilities.security import is_private_url +from mindsdb.integrations.handlers.llama_index_handler import config +from mindsdb.integrations.handlers.llama_index_handler.github_loader_helper import ( + _get_github_token, + _get_filter_file_extensions, + _get_filter_directories, +) +from mindsdb.integrations.utilities.handler_utils import get_api_key def _validate_prompt_template(prompt_template: str): - if '{context_str}' not in prompt_template or '{query_str}' not in prompt_template: + if "{context_str}" not in prompt_template or "{query_str}" not in prompt_template: raise Exception( - "Provided prompt template is invalid, missing `{context_str}`, `{query_str}`. Please ensure both placeholders are present and try again.") # noqa + "Provided prompt template is invalid, missing `{context_str}`, `{query_str}`. Please ensure both placeholders are present and try again." + ) # noqa class LlamaIndexHandler(BaseMLEngine): - """ Integration with the LlamaIndex data framework for LLM applications. 
""" - name = 'llama_index' + """Integration with the LlamaIndex data framework for LLM applications.""" + + name = "llama_index" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.generative = True - self.default_index_class = 'GPTVectorStoreIndex' - self.supported_index_class = ['GPTVectorStoreIndex'] - self.default_reader = 'DFReader' - self.supported_reader = ['DFReader', 'SimpleWebPageReader'] + self.default_index_class = "GPTVectorStoreIndex" + self.supported_index_class = ["GPTVectorStoreIndex", "VectorStoreIndex"] + self.default_reader = "DFReader" + self.supported_reader = [ + "DFReader", + "SimpleWebPageReader", + "GithubRepositoryReader", + "YoutubeTranscriptReader", + ] @staticmethod def create_validation(target, args=None, **kwargs): - if 'prompt_template' in args['using']: - _validate_prompt_template(args['using']['prompt_template']) - - if args['using'].get('mode') == 'conversational': - for param in ('user_column', 'assistant_column'): - if param not in args['using']: - raise Exception(f'Conversational mode requires {param} parameter') - - def create(self, target: str, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> None: - if 'using' not in args: - raise Exception("LlamaIndex engine requires a USING clause! Refer to its documentation for more details.") - - if 'index_class' not in args['using']: - args['using']['index_class'] = self.default_index_class - elif args['using']['index_class'] not in self.supported_index_class: - raise Exception(f"Invalid index class argument. Please use one of {self.supported_index_class}") - - if 'reader' not in args['using']: - args['using']['reader'] = self.default_reader - elif args['using']['reader'] not in self.supported_reader: - raise Exception(f"Invalid operation mode. Please use one of {self.supported_reader}") + reader = args["using"].get("reader", None) + if reader not in config.data_loaders: + raise Exception( + f"Invalid reader argument. 
Please use one of {config.data_loaders.keys()}" + ) + config_dict = config.data_loaders[reader] + + missing_keys = [key for key in config_dict if key not in args["using"]] + if missing_keys: + raise Exception(f"{reader} requires {missing_keys} arguments") + + if "prompt_template" in args["using"]: + _validate_prompt_template(args["using"]["prompt_template"]) + + if args["using"].get("mode") == "conversational": + for param in ("user_column", "assistant_column"): + if param not in args["using"]: + raise Exception(f"Conversational mode requires {param} parameter") + + def create( + self, + target: str, + df: Optional[pd.DataFrame] = None, + args: Optional[Dict] = None, + ) -> None: + if "using" not in args: + raise Exception( + "LlamaIndex engine requires a USING clause! Refer to its documentation for more details." + ) + + if "index_class" not in args["using"]: + args["using"]["index_class"] = self.default_index_class + elif args["using"]["index_class"] not in self.supported_index_class: + raise Exception( + f"Invalid index class argument. Please use one of {self.supported_index_class}" + ) + + if "reader" not in args["using"]: + args["using"]["reader"] = self.default_reader + elif args["using"]["reader"] not in self.supported_reader: + raise Exception( + f"Invalid operation mode. 
Please use one of {self.supported_reader}" + ) # workaround to create llama model without input data if df is None or df.empty: - df = pd.DataFrame([{'text': ''}]) - - if args['using']['reader'] == 'DFReader': - dstrs = df.apply(lambda x: ', '.join([f'{col}: {str(entry)}' for col, entry in zip(df.columns, x)]), axis=1) + df = pd.DataFrame([{"text": ""}]) + + if args["using"]["reader"] == "DFReader": + dstrs = df.apply( + lambda x: ", ".join( + [f"{col}: {str(entry)}" for col, entry in zip(df.columns, x)] + ), + axis=1, + ) reader = list(map(lambda x: Document(text=x), dstrs.tolist())) - elif args['using']['reader'] == 'SimpleWebPageReader': - if 'source_url_link' not in args['using']: - raise Exception("SimpleWebPageReader requires a `source_url_link` parameter. Refer to LlamaIndex documentation for more details.") # noqa - - url = args['using']['source_url_link'] + elif args["using"]["reader"] == "SimpleWebPageReader": + url = args["using"]["source_url_link"] config = Config() is_cloud = config.get("cloud", False) if is_cloud and is_private_url(url): - raise Exception(f'URL is private: {url}') + raise Exception(f"URL is private: {url}") reader = SimpleWebPageReader(html_to_text=True).load_data([url]) + elif args["using"]["reader"] == "GithubRepositoryReader": + engine_storage = self.engine_storage + + key = "GITHUB_TOKEN" + github_token = get_api_key( + key, "llama_index", args["using"], engine_storage, strict=False + ) + if github_token is None: + github_token = get_api_key( + key.lower(), + "llama_index", + args["using"], + engine_storage, + strict=True, + ) + + github_client = GithubClient(github_token) + owner = args["using"]["owner"] + repo = args["using"]["repo"] + filter_file_extensions = _get_filter_file_extensions(args["using"]) + filter_directories = _get_filter_directories(args["using"]) + + reader = GithubRepositoryReader( + github_client, + owner=owner, + repo=repo, + verbose=True, + filter_file_extensions=filter_file_extensions, + 
filter_directories=filter_directories, + ).load_data(branch=args["using"].get("branch", "main")) + + elif args["using"]["reader"] == "YoutubeTranscriptReader": + ytlinks = args["using"]["ytlinks"] + for link in ytlinks: + if not is_youtube_video(link): + raise Exception(f"Invalid youtube link: {link}") + reader = YoutubeTranscriptReader().load_data(ytlinks) + else: - raise Exception(f"Invalid operation mode. Please use one of {self.supported_reader}.") + raise Exception( + f"Invalid operation mode. Please use one of {self.supported_reader}." + ) - self.model_storage.json_set('args', args) + self.model_storage.json_set("args", args) index = self._setup_index(reader) - path = self.model_storage.folder_get('context') + path = self.model_storage.folder_get("context") index.storage_context.persist(persist_dir=path) - self.model_storage.folder_sync('context') + self.model_storage.folder_sync("context") def update(self, args) -> None: - prompt_template = args['using'].get('prompt_template', args.get('prompt_template', None)) + prompt_template = args["using"].get( + "prompt_template", args.get("prompt_template", None) + ) if prompt_template is not None: _validate_prompt_template(prompt_template) - args_cur = self.model_storage.json_get('args') - args_cur['using'].update(args['using']) + args_cur = self.model_storage.json_get("args") + args_cur["using"].update(args["using"]) # check new set of arguments self.create_validation(None, args_cur) - self.model_storage.json_set('args', args_cur) + self.model_storage.json_set("args", args_cur) - def predict(self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> pd.DataFrame: - pred_args = args['predict_params'] if args else {} + def predict( + self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None + ) -> pd.DataFrame: + pred_args = args["predict_params"] if args else {} - args = self.model_storage.json_get('args') + args = self.model_storage.json_get("args") engine_kwargs = {} - if 
args['using'].get('mode') == 'conversational': - user_column = args['using']['user_column'] - assistant_column = args['using']['assistant_column'] + if args["using"].get("mode") == "conversational": + user_column = args["using"]["user_column"] + assistant_column = args["using"]["assistant_column"] messages = [] - for row in df[:-1].to_dict('records'): + for row in df[:-1].to_dict("records"): + messages.append(f"user: {row[user_column]}") + messages.append(f"assistant: {row[assistant_column]}") - messages.append(f'user: {row[user_column]}') - messages.append(f'assistant: {row[assistant_column]}') + conversation = "\n".join(messages) - conversation = '\n'.join(messages) + questions = [df.iloc[-1][user_column]] - questions = [ - df.iloc[-1][user_column] - ] - - if 'prompt' in pred_args and pred_args['prompt'] is not None: - user_prompt = pred_args['prompt'] + if "prompt" in pred_args and pred_args["prompt"] is not None: + user_prompt = pred_args["prompt"] else: - user_prompt = args['using'].get('prompt', '') - - prompt_template = f'{user_prompt}\n'\ - f'---------------------\n' \ - f'We have provided context information below. \n' \ - f'{{context_str}}\n' \ - f'---------------------\n' \ - f'This is previous conversation history:\n' \ - f'{conversation}\n' \ - f'---------------------\n' \ - f'Given this information, please answer the question: {{query_str}}' - - engine_kwargs['text_qa_template'] = QuestionAnswerPrompt(prompt_template) + user_prompt = args["using"].get("prompt", "") + + prompt_template = ( + f"{user_prompt}\n" + f"---------------------\n" + f"We have provided context information below. 
\n" + f"{{context_str}}\n" + f"---------------------\n" + f"This is previous conversation history:\n" + f"{conversation}\n" + f"---------------------\n" + f"Given this information, please answer the question: {{query_str}}" + ) + + engine_kwargs["text_qa_template"] = PromptTemplate(prompt_template) else: - input_column = args['using'].get('input_column', None) + input_column = args["using"].get("input_column", None) - prompt_template = args['using'].get('prompt_template', args.get('prompt_template', None)) + prompt_template = args["using"].get( + "prompt_template", args.get("prompt_template", None) + ) if prompt_template is not None: _validate_prompt_template(prompt_template) - engine_kwargs['text_qa_template'] = QuestionAnswerPrompt(prompt_template) + engine_kwargs["text_qa_template"] = PromptTemplate(prompt_template) if input_column is None: - raise Exception(f'`input_column` must be provided at model creation time or through USING clause when predicting. Please try again.') # noqa + raise Exception( + f"`input_column` must be provided at model creation time or through USING clause when predicting. Please try again." + ) # noqa if input_column not in df.columns: - raise Exception(f'Column "{input_column}" not found in input data! Please try again.') + raise Exception( + f'Column "{input_column}" not found in input data! Please try again.' 
+ ) questions = df[input_column] - index_path = self.model_storage.folder_get('context') + index_path = self.model_storage.folder_get("context") storage_context = StorageContext.from_defaults(persist_dir=index_path) service_context = self._get_service_context() - index = load_index_from_storage(storage_context, service_context=service_context) + index = load_index_from_storage( + storage_context, service_context=service_context + ) query_engine = index.as_query_engine(**engine_kwargs) results = [] for question in questions: - query_results = query_engine.query(question) # TODO: provide extra_info in explain_target col + query_results = query_engine.query( + question + ) # TODO: provide extra_info in explain_target col results.append(query_results.response) - result_df = pd.DataFrame({'question': questions, args['target']: results}) # result_df['answer'].tolist() + result_df = pd.DataFrame( + {"question": questions, args["target"]: results} + ) # result_df['answer'].tolist() return result_df def _get_service_context(self): - args = self.model_storage.json_get('args') - openai_api_key = self._get_llama_index_api_key(args['using']) + args = self.model_storage.json_get("args") + engine_storage = self.engine_storage + + key = "OPENAI_API_KEY" + openai_api_key = get_api_key( + key, "llama_index", args["using"], engine_storage, strict=False + ) + if openai_api_key is None: + openai_api_key = get_api_key( + key.lower(), "llama_index", args["using"], engine_storage, strict=True + ) + openai.api_key = openai_api_key # TODO: shouldn't have to do this! bug? 
- llm_kwargs = { - 'openai_api_key': openai_api_key - } - if 'temperature' in args['using']: - llm_kwargs['temperature'] = args['using']['temperature'] - if 'model_name' in args['using']: - llm_kwargs['model_name'] = args['using']['model_name'] - if 'max_tokens' in args['using']: - llm_kwargs['max_tokens'] = args['using']['max_tokens'] + llm_kwargs = {"openai_api_key": openai_api_key} + if "temperature" in args["using"]: + llm_kwargs["temperature"] = args["using"]["temperature"] + if "model_name" in args["using"]: + llm_kwargs["model_name"] = args["using"]["model_name"] + if "max_tokens" in args["using"]: + llm_kwargs["max_tokens"] = args["using"]["max_tokens"] llm = OpenAI(**llm_kwargs) # TODO: all usual params should go here embed_model = OpenAIEmbedding(openai_api_key=openai_api_key) service_context = ServiceContext.from_defaults( - llm_predictor=LLMPredictor(llm=llm), - embed_model=embed_model + llm_predictor=LLMPredictor(llm=llm), embed_model=embed_model ) return service_context - + def _setup_index(self, documents): - args = self.model_storage.json_get('args') - indexer: VectorStore = getattr(llama_index, args['using']['index_class']) - index = indexer.from_documents(documents, service_context=self._get_service_context()) + args = self.model_storage.json_get("args") + indexer: VectorStore = getattr(llama_index, args["using"]["index_class"]) + index = indexer.from_documents( + documents, service_context=self._get_service_context() + ) return index - - def _get_llama_index_api_key(self, args, strict=True): - """ - API_KEY preference order: - 1. provided at model creation - 2. provided at engine creation - 3. OPENAI_API_KEY env variable - 4. llama_index.OPENAI_API_KEY setting in config.json - - Note: method is not case sensitive. 
- """ - key = 'OPENAI_API_KEY' - for k in key, key.lower(): - # 1 - if args.get(k): - return args[k] - # 2 - connection_args = self.engine_storage.get_connection_args() - if k in connection_args: - return connection_args[k] - # 3 - api_key = os.getenv(k) - if api_key is not None: - return api_key - # 4 - config = Config() - openai_cfg = config.get('llama_index', {}) - if k in openai_cfg: - return openai_cfg[k] - - if strict: - raise Exception(f'Missing API key "{k}". Either re-create this ML_ENGINE specifying the `{k}` parameter, or re-create this model and pass the API key with `USING` syntax.') # noqa
mindsdb/integrations/handlers/llama_index_handler/README.md+51 −1 modified@@ -8,7 +8,8 @@ LlamaIndex is a data framework for your LLM application. In this handler, we use - [x] LlamaIndex ML Handler - [x] [Support Web Page Reader](https://gpt-index.readthedocs.io/en/latest/examples/data_connectors/WebPageDemo.html) - [x] [Support Database Reader](https://gpt-index.readthedocs.io/en/latest/examples/data_connectors/DatabaseReaderDemo.html) - + - [x] [Support Github Reader](https://llamahub.ai/l/youtube_transcript?from=loaders) + - [x] [Support YoutubeTranscript Reader](https://llamahub.ai/l/youtube_transcript?from=loaders) **Note: To run YoutubeLoader `pip install youtube_transcript_api`** ## Example Usage @@ -56,3 +57,52 @@ JOIN files.question_table as t; ~~~~  + +## Example usage for GithubLoader +```sql +CREATE MODEL github_loader +PREDICT answer +USING + engine = 'llama_index', + index_class = 'VectorStoreIndex', + owner = 'mindsdb', + repo = 'mindsdb', + branch = 'staging', + reader = 'GithubRepositoryReader', + filter_type = 'include', + filter_file_extensions = ['.py','.html','.md'], + input_column = 'questions', + openai_api_key = '<your_openai_key>', + github_token = '<your_github_token>'; +``` + +```sql +SELECT a.questions, b.answer +FROM github_loader as b +JOIN files.questions as a +``` + +```sql +SELECT question, answer +FROM github_loader +WHERE questions = 'Explain steps to setup MindsDB on local machine?' +``` + +## Example usage for YoutubeTranscriptLoader +```sql +CREATE MODEL youtube_loader +PREDICT answer +USING + engine = 'llama_index', + index_class = 'VectorStoreIndex', + ytlinks = ['<link_of_youtube_videos>'], + reader = 'YoutubeTranscriptReader', + input_column = 'questions', + openai_api_key = '<your_openai_key>'; +``` + +```sql +SELECT question, answer +FROM youtube_loader +WHERE questions = 'What was the video about?' +``` \ No newline at end of file
mindsdb/integrations/handlers/llama_index_handler/requirements.txt+3 −2 modified@@ -1,2 +1,3 @@ -llama-index==0.8.57 -openai == 0.28.1 +llama-index==0.9.23 +llama-hub==0.0.62 +openai == 1.6.1
mindsdb/integrations/handlers/openai_handler/constants.py+2 −2 modified@@ -9,10 +9,10 @@ 'gpt-4-1106-preview', ) COMPLETION_MODELS = ('babbage-002', 'davinci-002') -FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002') +FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002','gpt-4') COMPLETION_LEGACY_BASE_MODELS = ('davinci', 'curie', 'babbage', 'ada') -FINETUNING_LEGACY_MODELS = COMPLETION_LEGACY_BASE_MODELS +FINETUNING_LEGACY_MODELS = FINETUNING_MODELS COMPLETION_LEGACY_MODELS = ( COMPLETION_LEGACY_BASE_MODELS + tuple(f'text-{model}-001' for model in COMPLETION_LEGACY_BASE_MODELS)
mindsdb/integrations/handlers/openai_handler/helpers.py+24 −23 modified@@ -1,22 +1,32 @@ import os -from typing import List, Optional +from typing import List import random import time import math import openai +from openai import OpenAI + import tiktoken import mindsdb.utilities.profiler as profiler from mindsdb.integrations.handlers.openai_handler.constants import OPENAI_API_BASE +class PendingFT(openai.OpenAIError): + message: str + def __init__(self, message) -> None: + super().__init__() + self.message = message + + def retry_with_exponential_backoff( initial_delay: float = 1, hour_budget: float = 0.3, jitter: bool = False, exponential_base: int = 2, - errors: tuple = (openai.error.RateLimitError, openai.error.APIConnectionError), + wait_errors: tuple = (openai.APITimeoutError, openai.APIConnectionError, PendingFT), + status_errors: tuple = (openai.APIStatusError, openai.APIResponseValidationError), ): """ Wrapper to enable optional arguments. It means this decorator always needs to be called with parenthesis: @@ -55,20 +65,13 @@ def wrapper(*args, **kwargs): while True: try: return func(*args, **kwargs) - except errors as e: - if e.error is not None: - if ( - e.error['type'] == 'invalid_request_error' - and 'Too many parallel completions' in e.error['message'] - or 'Please reduce the length of the messages' - in e.error['message'] - ): - raise e # InvalidRequestError triggers batched mode in the previous call - if e.error['type'] == 'insufficient_quota': - raise Exception( - 'API key has exceeded its quota, please try 1) increasing it or 2) using another key.' 
- ) # noqa + except status_errors as e: + raise Exception( + f'Error status {e.status_code} raised by OpenAI API: {e.body.get("message", "Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.")}' # noqa + ) # noqa + + except wait_errors: num_retries += 1 if num_retries > max_retries: raise Exception( @@ -78,12 +81,10 @@ def wrapper(*args, **kwargs): delay *= exponential_base * (1 + jitter * random.random()) time.sleep(delay) - except openai.error.OpenAIError as e: - if e.error is not None and e.error['type'] == 'insufficient_quota': - raise Exception( - 'API key has exceeded its quota, please try 1) increasing it or 2) using another key.' - ) # noqa - raise e + except openai.OpenAIError as e: + raise Exception( + f'General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.' # noqa + ) except Exception as e: raise e @@ -146,6 +147,6 @@ def get_available_models(api_key: str) -> List[str]: """ api_base = os.environ.get('OPENAI_API_BASE', OPENAI_API_BASE) - res = openai.Model.list(api_key=api_key, api_base=api_base) + res = OpenAI(api_key=api_key, base_url=api_base).models.list() - return [models["id"] for models in res.data] + return [models.id for models in res.data]
mindsdb/integrations/handlers/openai_handler/openai_handler.py+92 −82 modified@@ -8,8 +8,8 @@ import subprocess import concurrent.futures from typing import Optional, Dict - import openai +from openai import OpenAI import numpy as np import pandas as pd @@ -20,11 +20,12 @@ retry_with_exponential_backoff, truncate_msgs_for_token_limit, get_available_models, + PendingFT, ) from mindsdb.integrations.handlers.openai_handler.constants import ( CHAT_MODELS, IMAGE_MODELS, - FINETUNING_LEGACY_MODELS, + FINETUNING_MODELS, OPENAI_API_BASE, ) from mindsdb.integrations.utilities.handler_utils import get_api_key @@ -55,8 +56,7 @@ def __init__(self, *args, **kwargs): self.max_batch_size = 20 self.default_max_tokens = 100 self.chat_completion_models = CHAT_MODELS - self.supported_ft_models = FINETUNING_LEGACY_MODELS # base models compatible with finetuning # TODO #7387: transition to new endpoint before 4/1/24. Useful reference: Anyscale handler. # noqa - self.ft_cls = openai.FineTune + self.supported_ft_models = FINETUNING_MODELS # base models compatible with finetuning # TODO #7387: transition to new endpoint before 4/1/24. Useful reference: Anyscale handler. 
# noqa @staticmethod def create_validation(target, args=None, **kwargs): @@ -121,6 +121,7 @@ def create_validation(target, args=None, **kwargs): "temperature", "api_key", "openai_api_key", + "api_organization", } ) @@ -162,6 +163,13 @@ def predict(self, df: pd.DataFrame, args: Optional[Dict] = None) -> pd.DataFrame pred_args = args['predict_params'] if args else {} args = self.model_storage.json_get('args') + args['api_base'] = pred_args.get( + 'api_base', + args.get( + 'api_base', os.environ.get('OPENAI_API_BASE', OPENAI_API_BASE) + )) + if pred_args.get('api_organization'): + args['api_organization'] = pred_args['api_organization'] df = df.reset_index(drop=True) if pred_args.get('mode'): @@ -259,12 +267,6 @@ def predict(self, df: pd.DataFrame, args: Optional[Dict] = None) -> pd.DataFrame 'best_of': pred_args.get('best_of', None), 'logit_bias': pred_args.get('logit_bias', None), 'user': pred_args.get('user', None), - 'api_base': pred_args.get( - 'api_base', - args.get( - 'api_base', os.environ.get('OPENAI_API_BASE', OPENAI_API_BASE) - ), - ), # noqa } if ( @@ -388,11 +390,9 @@ def _completion( """ @retry_with_exponential_backoff() - def _submit_completion(model_name, prompts, api_key, api_args, args, df): + def _submit_completion(model_name, prompts, api_args, args, df): kwargs = { 'model': model_name, - 'api_key': api_key, - 'organization': args.get('api_organization'), } if model_name in IMAGE_MODELS: return _submit_image_completion(kwargs, prompts, api_args) @@ -420,32 +420,32 @@ def _log_api_call(params, response): def _submit_normal_completion(kwargs, prompts, api_args): def _tidy(comp): tidy_comps = [] - for c in comp['choices']: - if 'text' in c: - tidy_comps.append(c['text'].strip('\n').strip('')) + for c in comp.choices: + if hasattr(c,'text'): + tidy_comps.append(c.text.strip('\n').strip('')) return tidy_comps kwargs['prompt'] = prompts kwargs = {**kwargs, **api_args} before_openai_query(kwargs) - resp = _tidy(openai.Completion.create(**kwargs)) + resp = 
_tidy(client.completions.create(**kwargs)) _log_api_call(kwargs, resp) return resp def _submit_embedding_completion(kwargs, prompts, api_args): def _tidy(comp): tidy_comps = [] - for c in comp['data']: - if 'embedding' in c: - tidy_comps.append([c['embedding']]) + for c in comp.data: + if hasattr(c,'embedding'): + tidy_comps.append([c.embedding]) return tidy_comps kwargs['input'] = prompts kwargs = {**kwargs, **api_args} before_openai_query(kwargs) - resp = _tidy(openai.Embedding.create(**kwargs)) + resp = _tidy(client.embeddings.create(**kwargs)) _log_api_call(kwargs, resp) return resp @@ -454,9 +454,9 @@ def _submit_chat_completion( ): def _tidy(comp): tidy_comps = [] - for c in comp['choices']: - if 'message' in c: - tidy_comps.append(c['message']['content'].strip('\n').strip('')) + for c in comp.choices: + if hasattr(c,'message'): + tidy_comps.append(c.message.content.strip('\n').strip('')) return tidy_comps completions = [] @@ -496,7 +496,7 @@ def _tidy(comp): pkwargs = {**kwargs, **api_args} before_openai_query(kwargs) - resp = _tidy(openai.ChatCompletion.create(**pkwargs)) + resp = _tidy(client.chat.completions.create(**pkwargs)) _log_api_call(pkwargs, resp) completions.extend(resp) @@ -505,7 +505,7 @@ def _tidy(comp): pkwargs = {**kwargs, **api_args} before_openai_query(kwargs) - resp = _tidy(openai.ChatCompletion.create(**pkwargs)) + resp = _tidy(client.chat.completions.create(**pkwargs)) _log_api_call(pkwargs, resp) completions.extend(resp) @@ -536,26 +536,31 @@ def _tidy(comp): def _submit_image_completion(kwargs, prompts, api_args): def _tidy(comp): return [ - c[0]['url'] if 'url' in c[0].keys() else c[0]['b64_json'] + c.url if hasattr(c,'url') else c.b64_json for c in comp ] completions = [ - openai.Image.create(**{'prompt': p, **kwargs, **api_args})['data'] + client.images.generate(**{'prompt': p, **kwargs, **api_args}).data[0] for p in prompts ] return _tidy(completions) + + client = self._get_client( + api_key=api_key, + 
base_url=args.get('api_base'), + org=args.pop('api_organization') if 'api_organization' in args else None, + ) try: # check if simple completion works completion = _submit_completion( - model_name, prompts, api_key, api_args, args, df + model_name, prompts, api_args, args, df ) return completion - except openai.error.InvalidRequestError as e: + except Exception as e: # else, we get the max batch size - e = e.user_message - if 'you can currently request up to at most a total of' in e: + if 'you can currently request up to at most a total of' in str(e): pattern = 'a total of' max_batch_size = int(e[e.find(pattern) + len(pattern) :].split(').')[0]) else: @@ -569,7 +574,6 @@ def _tidy(comp): partial = _submit_completion( model_name, prompts[i * max_batch_size : (i + 1) * max_batch_size], - api_key, api_args, args, df, @@ -591,7 +595,6 @@ def _tidy(comp): _submit_completion, model_name, prompts[i * max_batch_size : (i + 1) * max_batch_size], - api_key, api_args, args, df, @@ -610,14 +613,18 @@ def describe(self, attribute: Optional[str] = None) -> pd.DataFrame: # TODO: Update to use update() artifacts args = self.model_storage.json_get('args') - + api_key = get_api_key('openai', args, self.engine_storage) + client= self._get_client( + api_key=api_key, + base_url=args.get('api_base'), + org=args.get('api_organization') + ) if attribute == 'args': return pd.DataFrame(args.items(), columns=['key', 'value']) elif attribute == 'metadata': - api_key = get_api_key('openai', args, self.engine_storage) model_name = args.get('model_name', self.default_model) - meta = openai.Model.retrieve(model_name, api_key=api_key) - return pd.DataFrame(meta.items(), columns=['key', 'value']) + meta = client.models.retrieve(model_name) + return pd.DataFrame(dict(meta).items(), columns=['key', 'value']) else: tables = ['args', 'metadata'] return pd.DataFrame(tables, columns=['tables']) @@ -644,6 +651,11 @@ def finetune( using_args = args.pop('using') if 'using' in args else {} prompt_col = 
using_args.get('prompt_column', 'prompt') completion_col = using_args.get('completion_column', 'completion') + + api_key = get_api_key('openai', args, self.engine_storage) + api_base = using_args.get('api_base', os.environ['OPENAI_API_BASE']) + org = using_args.get('api_organization') + client = self._get_client(api_key=api_key, base_url=api_base, org=org) self._check_ft_cols(df, [prompt_col, completion_col]) @@ -655,10 +667,6 @@ def finetune( f"This model cannot be finetuned. Supported base models are {self.supported_ft_models}" ) - openai.api_key = get_api_key('openai', args, self.engine_storage) - openai.api_base = args.get( - 'api_base', os.environ.get('OPENAI_API_BASE', OPENAI_API_BASE) - ) finetune_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') temp_storage_path = tempfile.mkdtemp() @@ -672,18 +680,15 @@ def finetune( jsons = {k: None for k in file_names.keys()} for split, file_name in file_names.items(): if os.path.isfile(os.path.join(temp_storage_path, file_name)): - jsons[split] = openai.File.create( - file=open(f"{temp_storage_path}/{file_name}", "rb"), - # api_base=openai.api_base, # TODO: rm - purpose='fine-tune', - ) + jsons[split] = client.files.create(file=open(f"{temp_storage_path}/{file_name}", "rb"), + purpose='fine-tune') - if type(jsons['train']) in (openai.File, openai.openai_object.OpenAIObject): + if type(jsons['train']) is openai.types.FileObject: train_file_id = jsons['train'].id else: train_file_id = jsons['base'].id - if type(jsons['val']) in (openai.File, openai.openai_object.OpenAIObject): + if type(jsons['val']) is openai.types.FileObject: val_file_id = jsons['val'].id else: val_file_id = None @@ -698,31 +703,34 @@ def finetune( start_time = datetime.datetime.now() - ft_stats, result_file_id = self._ft_call(ft_params, args.get('hour_budget', 8)) - ft_model_name = ft_stats['fine_tuned_model'] + ft_stats, result_file_id = self._ft_call(ft_params, client, args.get('hour_budget', 8)) + ft_model_name = 
ft_stats.fine_tuned_model end_time = datetime.datetime.now() runtime = end_time - start_time - name_extension = openai.File.retrieve(id=result_file_id).filename + name_extension = client.files.retrieve(file_id=result_file_id).filename result_path = f'{temp_storage_path}/ft_{finetune_time}_result_{name_extension}' - with open(result_path, 'wb') as f: - f.write(openai.File.download(id=result_file_id)) - - if '.csv' in name_extension: - # legacy endpoint - train_stats = pd.read_csv(result_path) - if 'validation_token_accuracy' in train_stats.columns: - train_stats = train_stats[ - train_stats['validation_token_accuracy'].notnull() - ] - args['ft_api_info'] = ft_stats.to_dict_recursive() - args['ft_result_stats'] = train_stats.to_dict() - elif '.json' in name_extension: - train_stats = pd.read_json( - path_or_buf=result_path, lines=True - ) # new endpoint - args['ft_api_info'] = args['ft_result_stats'] = train_stats.to_dict() + try: + client.files.content(file_id=result_file_id).stream_to_file(result_path) + if '.csv' in name_extension: + # legacy endpoint + train_stats = pd.read_csv(result_path) + if 'validation_token_accuracy' in train_stats.columns: + train_stats = train_stats[ + train_stats['validation_token_accuracy'].notnull() + ] + args['ft_api_info'] = ft_stats.dict() + args['ft_result_stats'] = train_stats.to_dict() + + elif '.json' in name_extension: + train_stats = pd.read_json( + path_or_buf=result_path, lines=True + ) # new endpoint + args['ft_api_info'] = args['ft_result_stats'] = train_stats.to_dict() + + except Exception: + logger.info(f'Error retrieving fine-tuning results. 
Please check manually for information on job {ft_stats.id} (result file {result_file_id}).') args['model_name'] = ft_model_name args['runtime'] = runtime.total_seconds() @@ -768,12 +776,11 @@ def _prepare_ft_jsonl(self, df, _, temp_filename, temp_model_path): } return file_names - @staticmethod - def _get_ft_model_type(model_name: str): - for model_type in ['ada', 'curie', 'babbage', 'davinci']: + def _get_ft_model_type(self, model_name: str): + for model_type in self.supported_ft_models: if model_type in model_name.lower(): return model_type - return 'ada' + return 'babbage-002' @staticmethod def _add_extra_ft_params(ft_params, using_args): @@ -797,36 +804,39 @@ def _add_extra_ft_params(ft_params, using_args): } return {**ft_params, **extra_params} - def _ft_call(self, ft_params, hour_budget): + def _ft_call(self, ft_params, client, hour_budget): """ Separate method to account for both legacy and new endpoints. Currently, `OpenAIHandler` uses the legacy endpoint. Others, like `AnyscaleEndpointsHandler`, use the new endpoint. """ - ft_result = self.ft_cls.create( + ft_result = client.fine_tuning.jobs.create( **{k: v for k, v in ft_params.items() if v is not None} ) @retry_with_exponential_backoff( hour_budget=hour_budget, - errors=(openai.error.RateLimitError, openai.error.OpenAIError), ) def _check_ft_status(model_id): - ft_retrieved = self.ft_cls.retrieve(id=model_id) - if ft_retrieved['status'] in ('succeeded', 'failed', 'cancelled'): + ft_retrieved = client.fine_tuning.jobs.retrieve(fine_tuning_job_id=model_id) + if ft_retrieved.status in ('succeeded', 'failed', 'cancelled'): return ft_retrieved else: - raise openai.error.OpenAIError('Fine-tuning still pending!') + raise PendingFT('Fine-tuning still pending!') ft_stats = _check_ft_status(ft_result.id) - if ft_stats['status'] != 'succeeded': + if ft_stats.status != 'succeeded': raise Exception( - f"Fine-tuning did not complete successfully (status: {ft_stats['status']}). 
Error message: {ft_stats['events'][-1]['message']}" + f"Fine-tuning did not complete successfully (status: {ft_stats.status}). Error message: {ft_stats.events[-1].message}" ) # noqa - result_file_id = self.ft_cls.retrieve(id=ft_result.id)['result_files'][0] + result_file_id = client.fine_tuning.jobs.retrieve(fine_tuning_job_id=ft_result.id).result_files[0] if hasattr(result_file_id, 'id'): result_file_id = result_file_id.id # legacy endpoint return ft_stats, result_file_id + + @staticmethod + def _get_client(api_key, base_url=OPENAI_API_BASE, org=None): + return OpenAI(api_key=api_key, base_url=base_url, organization=org)
mindsdb/integrations/handlers/openai_handler/requirements.txt+1 −1 modified@@ -1,2 +1,2 @@ -openai == 0.28.1 +openai == 1.6.1 tiktoken >= 0.3.0
mindsdb/integrations/handlers/rag_handler/requirements.txt+1 −2 modified@@ -1,6 +1,5 @@ -r mindsdb/integrations/handlers/chromadb_handler/requirements.txt -openai == 0.28.1 -pydantic>=1.10.8 +openai==1.6.1 html2text writerai~=1.1.0 pydantic
mindsdb/integrations/handlers/rag_handler/settings.py+4 −4 modified@@ -108,9 +108,9 @@ def get_available_openai_model_ids(args: dict) -> list: openai.api_key = args["openai_api_key"] - res = openai.Engine.list() + models = openai.OpenAI().models.list().data - return [models["id"] for models in res.data] + return [models.id for models in models] @dataclass @@ -266,12 +266,12 @@ def load_writer_llm(self) -> Writer: def load_openai_llm(self) -> partial: """Load OpenAI LLM API interface""" - openai.api_key = self.config_dict["openai_api_key"] + client = openai.OpenAI(api_key=self.config_dict["openai_api_key"]) config = self.config_dict.copy() config.pop("openai_api_key") config["model"] = config.pop("model_id") - return partial(openai.Completion.create, **config) + return partial(client.completions.create, **config) class RAGBaseParameters(BaseModel):
tests/handler_tests/test_instatus_handler.py+56 −9 modified@@ -11,6 +11,10 @@ class InstatusHandlerTest(unittest.TestCase): def setUpClass(cls): cls.handler = InstatusHandler(name='mindsdb_instatus', connection_data={'api_key': os.environ.get('INSTATUS_API_KEY')}) + def setUp(self): + self.pageId = self.handler.call_instatus_api(endpoint='/v2/pages')['id'][0] + self.componentId = self.handler.call_instatus_api(endpoint=f'/v1/{self.pageId}/components')['id'][0] + def test_0_check_connection(self): assert self.handler.check_connection() @@ -22,21 +26,23 @@ def test_2_get_tables(self): assert tables.type is not RESPONSE_TYPE.ERROR def test_3_get_columns(self): - columns = self.handler.get_columns(table_name='status_pages') - assert type(columns) is not RESPONSE_TYPE.ERROR + status_pages_columns = self.handler.get_columns(table_name='status_pages') + components_columns = self.handler.get_columns(table_name='components') + assert type(status_pages_columns) is not RESPONSE_TYPE.ERROR + assert type(components_columns) is not RESPONSE_TYPE.ERROR - def test_4_select(self): + def test_4_select_status_pages(self): query = '''SELECT * FROM mindsdb_instatus.status_pages''' self.assertTrue(self.handler.native_query(query)) - def test_5_select_by_conditions(self): + def test_5_select_status_pages_by_conditions(self): query = '''SELECT name, status, subdomain FROM mindsdb_instatus.status_pages WHERE id = "clo3xshsk1114842hkn377y3lrap"''' self.assertTrue(self.handler.native_query(query)) - def test_6_insert(self): + def test_6_insert_status_pages(self): query = f'''INSERT INTO mindsdb_instatus.status_pages (email, name, subdomain, components, logoUrl) VALUES ('{os.environ.get('EMAIL_ID')}', 'mindsdb', 'somtirtha-roy', '["Website", "App", "API"]', 'https://instatus.com/sample.png')''' try: self.assertTrue(self.handler.native_query(query)) @@ -45,9 +51,7 @@ def test_6_insert(self): if "This subdomain is taken by another status page" in error_message: print("Subdomain is already 
taken. Choose a different one.") - def test_7_update(self): - # get the id of the row to be updated - _id = self.handler.call_instatus_api(endpoint='/v2/pages')['id'][0] + def test_7_update_status_pages(self): # update the row with the id obtained query = f'''UPDATE mindsdb_instatus.status_pages SET logoUrl = 'https://instatus.com/sample.png', @@ -89,7 +93,50 @@ def test_7_update(self): "fr": "nasa" }} }}' - WHERE id = "{_id}"''' + WHERE id = "{self.pageId}"''' + self.assertTrue(self.handler.native_query(query)) + + def test_8_select_components(self): + query = f'''SELECT * + FROM mindsdb_instatus.components + WHERE page_id = '{self.pageId}';''' + self.assertTrue(self.handler.native_query(query)) + + def test_9_select_components_by_conditions(self): + query = f'''SELECT * + FROM mindsdb_instatus.components + WHERE page_id = '{self.pageId}' + AND component_id = '{self.componentId}';''' + self.assertTrue(self.handler.native_query(query)) + + def test_10_insert_components(self): + query = f'''INSERT INTO mindsdb_instatus.components (page_id, name, description, status, order, showUptime, grouped, translations_name_in_fr, translations_desc_in_fr) + VALUES ( + '{self.pageId}', + 'Test component', + 'Testing', + 'OPERATIONAL', + 6, + true, + false, + "Composant de test", + "En test" + );''' + self.assertTrue(self.handler.native_query(query)) + + def test_11_update_components(self): + query = f'''UPDATE mindsdb_instatus.components + SET + name = 'Test component 4', + description = 'Test test test', + status = 'OPERATIONAL', + order = 6, + showUptime = true, + grouped = false, + translations_name_in_fr = "Composant de test 4", + translations_desc_in_fr = "Test test test" + WHERE page_id = '{self.pageId}' + AND component_id = '{self.componentId}';''' self.assertTrue(self.handler.native_query(query))
tests/unit/ml_handlers/test_langchain_embedding.py+3 −3 modified@@ -180,7 +180,7 @@ def test_no_input_columns(self, mock_handler): "content2": ["world", "hello", "bar", "foo"], } ) - self.set_handler(mock_handler, name="pg", tables={"df": df}) + self.save_file("df", df) # create the model with no input columns specified should use # all columns when embedding the documents @@ -217,7 +217,7 @@ class = 'fake', -- a more user friendly name ret = self.run_sql( """ SELECT * FROM proj.test_dummy_no_input_columns - JOIN pg.df + JOIN files.df """ ) @@ -231,7 +231,7 @@ class = 'fake', -- a more user friendly name ret = self.run_sql( """ CREATE MODEL proj.test_dummy_no_input_columns_from_df - FROM pg ( + FROM files ( SELECT *, NULL as embeddings FROM df -- this requires an empty column called embeddings ) PREDICT embeddings
tests/unit/ml_handlers/test_openai.py+6 −1 modified@@ -7,8 +7,10 @@ from mindsdb.integrations.handlers.openai_handler.openai_handler import OpenAIHandler from ..executor_test_base import BaseExecutorTest +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") -@pytest.mark.skipif(os.environ.get('OPENAI_API_KEY') is None, reason='Missing API key!') + +@pytest.mark.skipif(OPENAI_API_KEY is None, reason='Missing API key!') class TestOpenAI(BaseExecutorTest): """Test Class for OpenAI Integration Testing""" @@ -225,6 +227,9 @@ class MockHandlerStorage: def json_get(self, key): return {'ft-suffix': {'ft-suffix': '$'}}[key] # finetuning suffix, irrelevant for this test but needed for init # noqa + def get_connection_args(self): + return {'api_key': OPENAI_API_KEY} # noqa + # create project handler = OpenAIHandler( model_storage=None, # the storage does not matter for this test
Vulnerability mechanics
Generated automatically on May 9, 2026. Inputs: CWE entries and fix-commit diffs from this CVE's patches. Citations validated against the bundle.
References
- github.com/advisories/GHSA-4jcv-vp96-94xr (GHSA advisory)
- nvd.nist.gov/vuln/detail/CVE-2024-24759 (NVD advisory)
- github.com/mindsdb/mindsdb/commit/5f7496481bd3db1d06a2d2e62c0dce960a1fe12b (fix commit, web)
- github.com/mindsdb/mindsdb/security/advisories/GHSA-4jcv-vp96-94xr (GitHub security advisory, confirmed)
News mentions
No linked articles in our index yet.