NVIDIA · tomer-levin-nv · Apr 5, 2026 · Apr 5, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -104,6 +104,9 @@ stores = [
   "duckdb>=1.2.0",
   "duckdb-engine>=0.13.0",
   "neo4j>=5.0",
+  "langchain-nvidia-ai-endpoints>=0.3.0",
+  "langchain-community>=0.4.0",
+  "langgraph>=1.1.0",
 ]
 
 # BEIR benchmarking and evaluation tools (not needed for production use).

@@ -18,6 +18,8 @@
 
 logger = logging.getLogger(__name__)
 
+conn = get_neo4j_conn()  # NOTE(review): runs at import time; confirm this is intended
+
 
 def load_schema_from_graph(
     db_name,

@@ -35,6 +35,30 @@ def query_neo4j_tables_for_embedding() -> List[dict]:
     return result[0].get("docs") or []
 
 
+def query_neo4j_columns_for_embedding() -> List[dict]:
+    """Collect one embedding doc per ``Column`` node (kept separate from table-level rows)."""
+    graph_conn = get_neo4j_conn()
+    cypher = f"""
+               MATCH (d:{Labels.DB})-[:{Edges.CONTAINS}]->(s:{Labels.SCHEMA})
+               -[:{Edges.CONTAINS}]->(t:{Labels.TABLE})
+               -[:{Edges.CONTAINS}]->(c:{Labels.COLUMN})
+               RETURN collect({{
+                 text: "db_name: " + d.name + ", schema_name: " + s.name +
+                   ", table_name: " + t.name + ", column_name: " + c.name +
+                   ", data_type: " + coalesce(toString(c.data_type), "") +
+                   CASE WHEN c.description IS NOT NULL AND trim(toString(c.description)) <> ""
+                     THEN ", column_description: " + toString(c.description) ELSE "" END,
+                 name: c.name,
+                 label: labels(c)[0],
+                 id: c.id
+               }}) as docs
+            """
+    rows = graph_conn.query_read(cypher, parameters={})
+    if rows:
+        return rows[0].get("docs") or []  # a missing or null "docs" value becomes []
+    return []
+
+
 def fetch_tabular_embedding_dataframe() -> pd.DataFrame:
     """Fetch all tabular entity docs from Neo4j and return a DataFrame ready for embedding.
 
@@ -43,7 +67,9 @@ def fetch_tabular_embedding_dataframe() -> pd.DataFrame:
     unstructured pipeline so run_pipeline_tasks_on_df works without changes.
     """
     _empty = pd.DataFrame(columns=["text", "_embed_modality", "path", "page_number", "metadata"])
-    docs = query_neo4j_tables_for_embedding()
+    table_docs = query_neo4j_tables_for_embedding()
+    column_docs = query_neo4j_columns_for_embedding()
+    docs = list(table_docs) + list(column_docs)
     if not docs:
         return _empty
 

@@ -41,6 +41,7 @@ def parse_query_slim(sql_text: str, query_obj: Query, dialect: str, schemas: dic
     if not table_matches:
         return False
 
+    column_ids: list[str] = []
     for table_key, match in table_matches.items():
         # table_key may be "schema.table" or just "table"; bare name is always the last part.
         bare_name = table_key.split(".")[-1]
@@ -72,6 +73,7 @@ def parse_query_slim(sql_text: str, query_obj: Query, dialect: str, schemas: dic
             try:
                 if schema.is_column_in_table(table_node, col_name):
                     col_node = schema.get_column_node(col_name, bare_name)
+                    column_ids.append(str(col_node.id))
                     query_obj.edges.append((query_obj.sql_node, col_node, edge_props))
             except Exception:
                 continue

@@ -8,6 +8,12 @@
 import pandas as pd
 
 
+def chunks(lst, n):
+    """Lazily produce consecutive slices of ``lst``, each holding at most ``n`` items."""
+    offsets = range(0, len(lst), n)
+    yield from (lst[offset : offset + n] for offset in offsets)
+
+
 def flat_list_recursive(nested_list):
     output = []
     for i in nested_list:
@@ -51,12 +57,6 @@ def remove_redundant_parentheses(text):
     return text
 
 
-def chunks(lst, n):
-    """Yield successive n-sized chunks from lst."""
-    for i in range(0, len(lst), n):
-        yield lst[i : i + n]
-
-
 def normalize_tables(df: pd.DataFrame) -> pd.DataFrame:
     """Normalize and type a tables DataFrame. Expects a DataFrame only."""
     types = {