11import os
22import time
33import webbrowser
4- from typing import List , Optional , Dict
4+ from typing import List , Optional , Dict , Tuple , Union
55import keyring
6-
76import pydantic
87import rich
98from rich .prompt import Confirm
109
10+ from data_diff .errors import DataDiffCustomSchemaNoConfigError , DataDiffDbtProjectVarsNotFoundError
11+
1112from . import connect_to_table , diff_tables , Algorithm
1213from .cloud import DatafoldAPI , TCloudApiDataDiff , TCloudApiOrgMeta , get_or_create_data_source
13- from .dbt_parser import DbtParser , PROJECT_FILE
14+ from .dbt_parser import DbtParser , PROJECT_FILE , TDatadiffConfig
1415from .tracking import (
1516 set_entrypoint_name ,
1617 set_dbt_user_id ,
@@ -52,24 +53,23 @@ def dbt_diff(
5253 project_dir_override : Optional [str ] = None ,
5354 is_cloud : bool = False ,
5455 dbt_selection : Optional [str ] = None ,
56+ state : Optional [str ] = None ,
5557) -> None :
5658 print_version_info ()
5759 diff_threads = []
5860 set_entrypoint_name ("CLI-dbt" )
59- dbt_parser = DbtParser (profiles_dir_override , project_dir_override )
61+ dbt_parser = DbtParser (profiles_dir_override , project_dir_override , state )
6062 models = dbt_parser .get_models (dbt_selection )
61- datadiff_variables = dbt_parser .get_datadiff_variables ()
62- config_prod_database = datadiff_variables .get ("prod_database" )
63- config_prod_schema = datadiff_variables .get ("prod_schema" )
64- config_prod_custom_schema = datadiff_variables .get ("prod_custom_schema" )
65- datasource_id = datadiff_variables .get ("datasource_id" )
63+ config = dbt_parser .get_datadiff_config ()
64+
6665 set_dbt_user_id (dbt_parser .dbt_user_id )
6766 set_dbt_version (dbt_parser .dbt_version )
6867 set_dbt_project_id (dbt_parser .dbt_project_id )
6968
70- if datadiff_variables .get ("custom_schemas" ) is not None :
71- logger .warning (
72- "vars: data_diff: custom_schemas: is no longer used and can be removed.\n To utilize custom schemas, see the documentation here: https://docs.datafold.com/development_testing/open_source"
69+ if not state and not (config .prod_database or config .prod_schema ):
70+ doc_url = "https://docs.datafold.com/development_testing/open_source#configure-your-dbt-project"
71+ raise DataDiffDbtProjectVarsNotFoundError (
72+ f"""vars: data_diff: section not found in dbt_project.yml.\n \n To solve this, please configure your dbt project: \n { doc_url } \n \n Or specify a production manifest using the `--state` flag."""
7373 )
7474
7575 if is_cloud :
@@ -79,13 +79,13 @@ def dbt_diff(
7979 return
8080 org_meta = api .get_org_meta ()
8181
82- if datasource_id is None :
82+ if config . datasource_id is None :
8383 rich .print ("[red]Data source ID not found in dbt_project.yml" )
8484 is_create_data_source = Confirm .ask ("Would you like to create a new data source?" )
8585 if is_create_data_source :
86- datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
86+ config . datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
8787 rich .print (f'To use the data source in next runs, please, update your "{ PROJECT_FILE } " with a block:' )
88- rich .print (f"[green]vars:\n data_diff:\n datasource_id: { datasource_id } \n " )
88+ rich .print (f"[green]vars:\n data_diff:\n datasource_id: { config . datasource_id } \n " )
8989 rich .print (
9090 "Read more about Datafold vars in docs: "
9191 "https://docs.datafold.com/os_diff/dbt_integration/#configure-a-data-source\n "
@@ -96,21 +96,29 @@ def dbt_diff(
9696 "\n vars:\n data_diff:\n datasource_id: 1234"
9797 )
9898
99- data_source = api .get_data_source (datasource_id )
99+ data_source = api .get_data_source (config . datasource_id )
100100 dbt_parser .set_casing_policy_for (connection_type = data_source .type )
101101 rich .print ("[green][bold]\n Diffs in progress...[/][/]\n " )
102102
103103 else :
104104 dbt_parser .set_connection ()
105105
106106 for model in models :
107- diff_vars = _get_diff_vars (
108- dbt_parser , config_prod_database , config_prod_schema , config_prod_custom_schema , model
109- )
107+ diff_vars = _get_diff_vars (dbt_parser , config , model )
108+
109+ # we won't always have a prod path when using state
110+ # when the model DNE in prod manifest, skip the model diff
111+ if (
112+ state and len (diff_vars .prod_path ) < 2
113+ ): # < 2 because some providers like databricks can legitimately have *only* 2
114+ diff_output_str = _diff_output_base ("." .join (diff_vars .dev_path ), "." .join (diff_vars .prod_path ))
115+ diff_output_str += "[green]New model: nothing to diff![/] \n "
116+ rich .print (diff_output_str )
117+ continue
110118
111119 if diff_vars .primary_keys :
112120 if is_cloud :
113- diff_thread = run_as_daemon (_cloud_diff , diff_vars , datasource_id , api , org_meta )
121+ diff_thread = run_as_daemon (_cloud_diff , diff_vars , config . datasource_id , api , org_meta )
114122 diff_threads .append (diff_thread )
115123 else :
116124 _local_diff (diff_vars )
@@ -128,41 +136,19 @@ def dbt_diff(
128136
129137def _get_diff_vars (
130138 dbt_parser : "DbtParser" ,
131- config_prod_database : Optional [str ],
132- config_prod_schema : Optional [str ],
133- config_prod_custom_schema : Optional [str ],
139+ config : TDatadiffConfig ,
134140 model ,
135141) -> TDiffVars :
136142 dev_database = model .database
137143 dev_schema = model .schema_
138144
139145 primary_keys = dbt_parser .get_pk_from_model (model , dbt_parser .unique_columns , "primary-key" )
140146
141- # "custom" dbt config database
142- if model .config .database :
143- prod_database = model .config .database
144- elif config_prod_database :
145- prod_database = config_prod_database
147+ # prod path is constructed via configuration or the prod manifest via --state
148+ if dbt_parser .prod_manifest_obj :
149+ prod_database , prod_schema = _get_prod_path_from_manifest (model , dbt_parser .prod_manifest_obj )
146150 else :
147- prod_database = dev_database
148-
149- # prod schema name differs from dev schema name
150- if config_prod_schema :
151- custom_schema = model .config .schema_
152-
153- # the model has a custom schema config(schema='some_schema')
154- if custom_schema :
155- if not config_prod_custom_schema :
156- raise ValueError (
157- f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value!\n "
158- + "For more details see: https://docs.datafold.com/development_testing/open_source"
159- )
160- prod_schema = config_prod_custom_schema .replace ("<custom_schema>" , custom_schema )
161- # no custom schema, use the default
162- else :
163- prod_schema = config_prod_schema
164- else :
165- prod_schema = dev_schema
151+ prod_database , prod_schema = _get_prod_path_from_config (config , model , dev_database , dev_schema )
166152
167153 if dbt_parser .requires_upper :
168154 dev_qualified_list = [x .upper () for x in [dev_database , dev_schema , model .alias ] if x ]
@@ -186,6 +172,45 @@ def _get_diff_vars(
186172 )
187173
188174
175+ def _get_prod_path_from_config (config , model , dev_database , dev_schema ) -> Tuple [str , str ]:
176+ # "custom" dbt config database
177+ if model .config .database :
178+ prod_database = model .config .database
179+ elif config .prod_database :
180+ prod_database = config .prod_database
181+ else :
182+ prod_database = dev_database
183+
184+ # prod schema name differs from dev schema name
185+ if config .prod_schema :
186+ custom_schema = model .config .schema_
187+
188+ # the model has a custom schema config(schema='some_schema')
189+ if custom_schema :
190+ if not config .prod_custom_schema :
191+ raise DataDiffCustomSchemaNoConfigError (
192+ f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value or utilize the `--state` flag!\n \n "
193+ + "For more details see: https://docs.datafold.com/development_testing/open_source"
194+ )
195+ prod_schema = config .prod_custom_schema .replace ("<custom_schema>" , custom_schema )
196+ # no custom schema, use the default
197+ else :
198+ prod_schema = config .prod_schema
199+ else :
200+ prod_schema = dev_schema
201+ return prod_database , prod_schema
202+
203+
204+ def _get_prod_path_from_manifest (model , prod_manifest ) -> Union [Tuple [str , str ], Tuple [None , None ]]:
205+ prod_database = None
206+ prod_schema = None
207+ prod_model = prod_manifest .nodes .get (model .unique_id , None )
208+ if prod_model :
209+ prod_database = prod_model .database
210+ prod_schema = prod_model .schema_
211+ return prod_database , prod_schema
212+
213+
189214def _local_diff (diff_vars : TDiffVars ) -> None :
190215 dev_qualified_str = "." .join (diff_vars .dev_path )
191216 prod_qualified_str = "." .join (diff_vars .prod_path )
0 commit comments