How to build raw interactions table using Data API

View as Markdown

This document describes the process of reconstructing the Raw Interactions table using data retrieved from the Data API.

The Data API provides raw data in a normalized format, split across multiple related tables. To recreate the unified Raw Interactions view, these tables must be joined and transformed to align with the structure and semantics of the original dataset.

In this exercise, we download the raw data into an Excel file and perform the necessary transformations within Excel to build the final view.

Step 1: Fetching and storing the data in an Excel file.

The Data API distributes raw data across five key tables: Conversations, Interactions, Plugin Calls, Plugin Resources, and Users.

We use a Python script to fetch data from the past 24 hours and generate an Excel file containing all five tables.

You can copy the script into your environment and execute it. Be sure to replace the placeholder for the API key with your actual Data API credentials before running the script.

Python script

1import requests
2import pandas as pd
3import time
4import logging
5from pathlib import Path
6import re
7
# Config
BASE_API = 'https://api.moveworks.ai/export/v1beta2/records'
ENDPOINTS = [
    'interactions',
    'conversations',
    'plugin-calls',
    'plugin-resources',
    'users',
]
ACCESS_TOKEN = '<Access Token>'  # Replace this

# Filter time window (applied to all endpoints)
TIME_FILTER = ("last_updated_time ge '2025-07-15T00:00:00.000Z' "
               "and last_updated_time le '2025-07-16T00:00:00.000Z'")

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Headers sent with every request
headers = {'Authorization': f'Bearer {ACCESS_TOKEN}'}

# Retry config
MAX_RETRIES = 5
# Function to clean illegal characters
def clean_illegal_chars(df):
    """Strip ASCII control characters that openpyxl refuses to write to Excel.

    BUG FIX: the original pattern was mojibake (`r'[�- -]'`); the intended
    class is the control range excluding tab (\x09), LF (\x0A) and CR (\x0D),
    which Excel accepts (same pattern the companion script uses).
    """
    def remove_illegal_chars(value):
        if isinstance(value, str):
            return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', value)
        return value
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    if hasattr(df, 'map'):
        return df.map(remove_illegal_chars)
    return df.applymap(remove_illegal_chars)
40
# Flatten nested fields
def flatten_record(record):
    """Promote keys of a nested ``detail`` dict to top-level ``detail_<key>`` fields.

    List values are collapsed to comma-separated strings; non-dict ``detail``
    values are left untouched.
    """
    flattened = dict(record)
    nested = flattened.get("detail")
    if isinstance(nested, dict):
        del flattened["detail"]
        for name, entry in nested.items():
            if isinstance(entry, list):
                entry = ','.join(str(item) for item in entry)
            flattened[f'detail_{name}'] = entry
    return flattened
51
# Collect data per endpoint
all_dataframes = {}

def fetch_data(endpoint):
    """Download every page of records for *endpoint* into ``all_dataframes``.

    Follows OData pagination via '@odata.nextLink', waits out 429 rate limits,
    and retries transient 5xx errors with exponential backoff (capped at
    MAX_RETRIES). Records are flattened and cleaned before storage.
    """
    url = f'{BASE_API}/{endpoint}'
    params = {
        '$orderby': 'id desc',
        '$filter': TIME_FILTER,
    }

    data = []
    retries = 0
    page_count = 0

    logging.info(f"Fetching data from /{endpoint}")

    while url:
        try:
            logging.info(f"Requesting page {page_count + 1}: {url}")
            # Query params only apply to the first request; nextLink URLs
            # already embed their own query string.
            response = requests.get(url, headers=headers, params=params if url.endswith(endpoint) else None)

            if response.status_code == 200:
                json_resp = response.json()
                page_data = json_resp.get('value', [])
                logging.info(f"Retrieved {len(page_data)} records from page {page_count + 1}")

                if not page_data:
                    break

                for record in page_data:
                    data.append(flatten_record(record))

                url = json_resp.get('@odata.nextLink')
                retries = 0
                page_count += 1

                # The users endpoint is throttled harder; pace the requests.
                if endpoint == 'users':
                    time.sleep(2)

            elif response.status_code == 429:
                wait = 90 if endpoint == 'users' else 60
                logging.warning(f"Rate limited on /{endpoint}. Waiting for {wait} seconds.")
                time.sleep(wait)

            elif response.status_code in (500, 502, 503, 504):
                if retries < MAX_RETRIES:
                    wait = 2 ** retries
                    logging.warning(f"Transient error {response.status_code} on /{endpoint}. Retrying in {wait} seconds...")
                    time.sleep(wait)
                    retries += 1
                else:
                    logging.error(f"Max retries reached on /{endpoint}. Aborting.")
                    break

            else:
                logging.error(f"Unexpected error {response.status_code} on /{endpoint}: {response.text}")
                break

        except Exception:
            # logging.exception records the traceback; abort rather than risk
            # an infinite loop on a non-HTTP failure. (The original bound the
            # exception to an unused variable.)
            logging.exception(f"Exception occurred while calling /{endpoint}")
            break

    if data:
        df = pd.DataFrame(data)
        df = clean_illegal_chars(df)
        all_dataframes[endpoint] = df
        logging.info(f"Ingestion complete for /{endpoint}, {len(df)} records.")
    else:
        logging.info(f"No data retrieved from /{endpoint}.")
121
# Run ingestion for all endpoints
for endpoint in ENDPOINTS:
    fetch_data(endpoint)

# Save to Excel with one sheet per endpoint
if all_dataframes:
    output_file = Path("moveworks_export.xlsx")
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for endpoint, df in all_dataframes.items():
            # Excel caps sheet names at 31 characters.
            df.to_excel(writer, sheet_name=endpoint[:31], index=False)
    logging.info(f"All data written to {output_file}")
else:
    logging.warning("No data to write to Excel.")

Step 2: Modifying the Excel file to generate the raw interactions view.

Let’s review the sequence of columns shown in the Raw Interactions table and identify where each piece of data is stored in the Data API.

Raw interactions table

Definition

Data API table

Timestamp

This is the created time for the user interaction

Interactions

Conversation ID

The conversation ID attributed to a stored interaction

Interactions

Conversation Domain

This is detected domain on a conversation level

Conversations

Conversation Topic

All topics aggregated for interactions based on the conversation ID

Interactions

Interaction Type

Type of interaction

Interactions

Interaction content

User message or action information

Interactions

Bot Response

Summarized response provided by the AI Assistant

Interactions

Unsuccessful Plugins

Plugins called by the AI Assistant but were not served and not used

Plugin calls

Plugin Served

Plugins that the reasoner considered and served to the user

Plugin calls

Plugin Used

Plugins that the user interacted or engaged with

Plugin calls

Resource Domain

Resource domain configured during setting up the ingestion.

Plugin resource

No of citations

Total no of citations provided in the summarized response

Plugin resource

Content Item name

Name of the content item used to generate the summarized response

Plugin resource

Content Item ID

External resource ID of the content item

Plugin resource

Ticket type

Indicates who filed the ticket

  1. user initiated ticket : ticket filed by the end user as an escalation
  2. bot initiated ticket : ticket filed by the AI Assistant as a receipt

Plugin resource

Ticket ID

Unique identifier for a ticket

Plugin resource

Interaction surface

Platform where the interaction took place

Interactions

User Department

Department to which the end user belongs


User Location

Location to which the end user belongs


User country

Country to which the end user belongs


User Preferred language

Default language configured in the AI Assistant

Users

Now that we have identified the columns required to reconstruct the Raw Interactions view, let’s begin working through the raw data and progressively merge the interactions and conversations tables to form a unified view. Please note that we will be using Excel functions throughout this process.

  1. Bring in the base columns — Actor, Conversation ID, Created Time, Interaction ID, Interaction Type

    a. From the interactions sheet, extract the relevant details into a new sheet. We are only interested in user-led interactions.

    Use the following formula to filter this data:

    =FILTER(interactions!A:D, interactions!A:A="user")

  2. Using the Conversation ID, fetch Conversation Domain and Conversation Topic

    a. To fetch the Conversation Domain, use the VLOOKUP function to pull data from the conversations sheet:

    =VLOOKUP(B4,conversations!B:E,3,FALSE)

    b. To recreate the Conversation Topic, aggregate all topics detected at the interaction level and replicate them for each interaction.

    Use this formula:

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER(interactions!P:P, interactions!B:B= B2)))

  3. Fetch additional content fields from the Interactions table

    a. To extract Interaction Content, pull values from detail_content, detail_detail, or similar columns, and combine them based on data availability. Use the following nested IF structure:

    =IF(INDEX(interactions!K:K, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!K:K, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!N:N, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!N:N, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!L:L, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!L:L, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!Q:Q, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!Q:Q, MATCH(D1294, interactions!D:D, 0)), ""))))

    b. To retrieve the Bot Response corresponding to each user interaction, use the parent interaction ID to index into bot-led interactions:

    =TEXTJOIN(CHAR(10), TRUE, FILTER(interactions!K:K, interactions!G:G=D2))

    There can be multiple bot interactions pointing to a single user interaction. Join these interactions to form a bot response similar to raw interactions table.

  4. Determine plugins involved in an interaction using the Plugin Calls table

    Retrieve three plugin-related columns using the formulas below:

    a. Unsuccessful plugin:

    =TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =FALSE) *('plugin-calls'!I:I=FALSE)))

    b. Plugin served:

    =TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =TRUE)))

    c. Plugin used:

    =TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =TRUE) *('plugin-calls'!I:I=TRUE)))

  5. Add plugin resource information

    Use the plugin-resources table to fetch citation and content-related metadata:

    a. Resource Domain: Indicates the configured domain for knowledge or ticket-based interactions.

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!K:K, 'plugin-resources'!E:E = D2)))

    b. Number of Citations: Counts distinct resources cited per interaction.

    =IFERROR(COUNTA(UNIQUE(FILTER('plugin-resources'!D:D, 'plugin-resources'!E:E = D2))), 0)

    Note: Even when there’s a #CALC! error, Excel may return 1. Use the Content Item ID column to clean up any incorrect entries.

    c. Content Item ID:

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!D:D, 'plugin-resources'!E:E = D2)))

    Filter out rows returning #CALC!. For such rows, mark No of Citations as 1 and Content Item ID as “Nil”.

    d. Content Item Name:

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!M:M, 'plugin-resources'!E:E = D2)))

  6. Populate ticketing information (only applicable when a ticket is created)

    a. Ticket Type: Indicates whether a ticket was user-initiated or bot-initiated.

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!S:S, 'plugin-resources'!E:E = D2)))

    b. Ticket ID: Filter the values to include only rows where the resource type is “ticket”.

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!H:H,('plugin-resources'!E:E = D4)*('plugin-resources'!I:I="RESOURCE_TYPE_TICKET"))))

  7. Fetch the platform where the interaction took place

    This can be extracted from the interactions sheet based on Interaction ID:

    =TEXTJOIN(", ", TRUE, UNIQUE(FILTER(interactions!H:H, interactions!D:D = D2)))

  8. Get the user’s preferred language from the Users table

    Use this formula to fetch preferred language based on user ID:

    =TEXTJOIN(", ",TRUE,UNIQUE(FILTER(users!J:J,users!F:F=F3)))


Additional Notes:

Fields like User Department, User Location, and User Country are not directly available from the Data API. These attributes must be sourced from your organization’s external identity systems.

The User API provides an external ID which can be used to enrich user metadata by integrating with external identity providers such as Okta. You can refer to Okta List Users for details. Once this data is pulled from the external system, join it using the external ID to populate the missing attributes.


While the modifications shown here are demonstrated in Excel, we are also providing a Python script that can directly generate a raw interactions table based on the time input specified in the script.

To run a Python file, first install Python from python.org. On Mac, open Terminal, navigate to your file’s folder, and run python3 script.py. On Windows, open Command Prompt, go to the file’s folder, and run python script.py.

1import requests
2import pandas as pd
3import time
4import logging
5from pathlib import Path
6import re
7from datetime import datetime, timedelta
8
# Config
BASE_API = 'https://api.moveworks.ai/export/v1beta2/records'
ENDPOINTS = [
    'interactions',
    'conversations',
    'plugin-calls',
    'plugin-resources',
    'users',
]
ACCESS_TOKEN = '<Access Token>'  # Replace this

# Time filter will be set dynamically based on user input

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Headers sent with every request
headers = {'Authorization': f'Bearer {ACCESS_TOKEN}'}

# Retry config
MAX_RETRIES = 5
32
def get_user_date_inputs():
    """Prompt for a start/end date and build the API time filter.

    Returns a tuple ``(time_filter, start_date_str, end_date_str)`` where the
    filter covers the start day at 00:00:00.000 UTC through the end day at
    23:59:59.999 UTC (inclusive range).
    """
    print("\n" + "="*50)
    print("RAW INTERACTIONS TABLE BUILDER")
    print("="*50)
    print("Enter the date range for data extraction:")
    print("Date format: YYYY-MM-DD (e.g., 2025-07-15)")
    print("-"*50)

    while True:
        try:
            start_date_str = input("Enter start date (YYYY-MM-DD): ").strip()
            start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
            break
        except ValueError:
            print("❌ Invalid date format. Please use YYYY-MM-DD format (e.g., 2025-07-15)")

    while True:
        try:
            end_date_str = input("Enter end date (YYYY-MM-DD): ").strip()
            end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

            if end_date < start_date:
                print("❌ End date cannot be earlier than start date. Please try again.")
                continue
            break
        except ValueError:
            print("❌ Invalid date format. Please use YYYY-MM-DD format (e.g., 2025-07-15)")

    # Convert to timestamp format for the API. The time-of-day parts are fixed
    # literals in the format strings, so no timedelta arithmetic is needed
    # (the original added a day and subtracted a microsecond for no effect).
    start_timestamp = start_date.strftime("%Y-%m-%dT00:00:00.000Z")
    end_timestamp = end_date.strftime("%Y-%m-%dT23:59:59.999Z")

    time_filter = f"last_updated_time ge '{start_timestamp}' and last_updated_time le '{end_timestamp}'"

    print(f"\n✅ Date range set:")
    print(f" Start: {start_date_str} 00:00:00 UTC")
    print(f" End: {end_date_str} 23:59:59 UTC")
    print(f" Filter: {time_filter}")
    print("-"*50)

    return time_filter, start_date_str, end_date_str
76
def clean_illegal_chars(df):
    """Strip ASCII control characters that openpyxl refuses to write to Excel.

    Tab (\x09), LF (\x0A) and CR (\x0D) are kept, as Excel accepts them.
    """
    def remove_illegal_chars(value):
        if isinstance(value, str):
            return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', value)
        return value
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    if hasattr(df, 'map'):
        return df.map(remove_illegal_chars)
    return df.applymap(remove_illegal_chars)
84
def flatten_record(record):
    """Flatten a nested ``detail`` dict into ``detail_<key>`` top-level fields.

    Lists inside ``detail`` become comma-separated strings. Anything else is
    copied through untouched.
    """
    result = dict(record)
    if not isinstance(result.get("detail"), dict):
        return result
    for key, value in result.pop("detail").items():
        result[f'detail_{key}'] = (
            ','.join(str(v) for v in value) if isinstance(value, list) else value
        )
    return result
95
def fetch_data(endpoint, time_filter):
    """Download all pages of records for *endpoint* filtered by *time_filter*.

    Follows OData pagination via '@odata.nextLink', waits out 429 rate limits,
    and retries transient 5xx errors with exponential backoff (capped at
    MAX_RETRIES). Returns a cleaned DataFrame (empty when nothing was fetched).
    """
    url = f'{BASE_API}/{endpoint}'
    params = {
        '$orderby': 'id desc',
        '$filter': time_filter,
    }

    data = []
    retries = 0
    page_count = 0

    logging.info(f"Fetching data from /{endpoint}")

    while url:
        try:
            logging.info(f"Requesting page {page_count + 1}: {url}")
            # Query params only apply to the first request; nextLink URLs
            # already embed their own query string.
            response = requests.get(url, headers=headers, params=params if url.endswith(endpoint) else None)

            if response.status_code == 200:
                json_resp = response.json()
                page_data = json_resp.get('value', [])
                logging.info(f"Retrieved {len(page_data)} records from page {page_count + 1}")

                if not page_data:
                    break

                for record in page_data:
                    data.append(flatten_record(record))

                url = json_resp.get('@odata.nextLink')
                retries = 0
                page_count += 1

                # The users endpoint is throttled harder; pace the requests.
                if endpoint == 'users':
                    time.sleep(2)

            elif response.status_code == 429:
                wait = 90 if endpoint == 'users' else 60
                logging.warning(f"Rate limited on /{endpoint}. Waiting for {wait} seconds.")
                time.sleep(wait)

            elif response.status_code in (500, 502, 503, 504):
                if retries < MAX_RETRIES:
                    wait = 2 ** retries
                    logging.warning(f"Transient error {response.status_code} on /{endpoint}. Retrying in {wait} seconds...")
                    time.sleep(wait)
                    retries += 1
                else:
                    logging.error(f"Max retries reached on /{endpoint}. Aborting.")
                    break

            else:
                logging.error(f"Unexpected error {response.status_code} on /{endpoint}: {response.text}")
                break

        except Exception:
            # logging.exception records the traceback; abort rather than risk
            # an infinite loop on a non-HTTP failure. (The original bound the
            # exception to an unused variable.)
            logging.exception(f"Exception occurred while calling /{endpoint}")
            break

    if data:
        df = pd.DataFrame(data)
        df = clean_illegal_chars(df)
        logging.info(f"Ingestion complete for /{endpoint}, {len(df)} records.")
        return df
    else:
        logging.info(f"No data retrieved from /{endpoint}.")
        return pd.DataFrame()
164
def get_conversation_topics(interactions_df, conversation_id):
    """Join the unique detected entities for one conversation into a string."""
    in_conversation = interactions_df['conversation_id'] == conversation_id
    unique_entities = interactions_df.loc[in_conversation, 'detail_entity'].dropna().unique()
    return ', '.join(str(entity) for entity in unique_entities if str(entity) != 'nan')
169
def get_interaction_content(row, interactions_df):
    """Return the user message content (``detail_content``) for *row*, or ''.

    Per the API example, detail.content is the main field carrying user
    message content; anything missing, NaN, or empty yields ''.
    """
    matches = interactions_df[interactions_df['id'] == row['id']]
    if matches.empty:
        return ""

    record = matches.iloc[0]
    if 'detail_content' not in record:
        return ""

    content = record['detail_content']
    if pd.notna(content) and str(content) != '':
        return str(content)
    return ""
186
def get_bot_response(row, interactions_df):
    """Join the content of every bot interaction whose parent is *row*'s interaction.

    BUG FIX: the original returned only the first matching bot reply, but the
    raw interactions view (and the documented Excel step using
    TEXTJOIN(CHAR(10), ...)) concatenates ALL bot messages that answer a
    single user interaction, separated by newlines.
    """
    interaction_id = row['id']

    # Bot-led interactions pointing back at this user interaction.
    bot_rows = interactions_df[
        (interactions_df['parent_interaction_id'] == interaction_id) &
        (interactions_df['actor'] == 'bot')
    ]

    if bot_rows.empty or 'detail_content' not in bot_rows.columns:
        return ""

    parts = [
        str(content)
        for content in bot_rows['detail_content']
        if pd.notna(content) and str(content) != ''
    ]
    return "\n".join(parts)
204
def get_plugin_info(interaction_id, plugin_calls_df, plugin_type):
    """Return a comma-joined list of plugin names for one interaction.

    *plugin_type* selects the subset: 'unsuccessful' (not served and not
    used), 'served' (served), 'used' (served and used). Unknown types or no
    matching rows yield ''.
    """
    if plugin_calls_df.empty:
        return ""

    calls = plugin_calls_df[plugin_calls_df['interaction_id'] == interaction_id]
    if calls.empty:
        return ""

    if plugin_type == 'unsuccessful':
        subset = calls[(calls['served'] == False) & (calls['used'] == False)]
    elif plugin_type == 'served':
        subset = calls[calls['served'] == True]
    elif plugin_type == 'used':
        subset = calls[(calls['served'] == True) & (calls['used'] == True)]
    else:
        return ""

    names = subset['plugin_name'].unique()
    return ', '.join(str(name) for name in names if str(name) != 'nan')
236
def get_resource_info(interaction_id, plugin_resources_df, info_type):
    """Look up resource metadata for one interaction.

    *info_type*: 'domain', 'citation_count', 'content_item_id',
    'content_item_name', 'ticket_type', or 'ticket_id'. Returns a string,
    except 'citation_count' which returns an int (0 when nothing matches).
    """
    if plugin_resources_df.empty:
        return "" if info_type != 'citation_count' else 0

    interaction_resources = plugin_resources_df[plugin_resources_df['interaction_id'] == interaction_id]

    if interaction_resources.empty:
        return "" if info_type != 'citation_count' else 0

    if info_type == 'domain':
        domains = interaction_resources['detail_domain'].dropna().unique()
        return ', '.join([str(d) for d in domains if str(d) != 'nan'])

    elif info_type == 'citation_count':
        # Count distinct resources cited for this interaction.
        return interaction_resources['resource_id'].dropna().nunique()

    elif info_type == 'content_item_id':
        # Use external_resource_id from detail field
        content_ids = interaction_resources['detail_external_resource_id'].dropna().unique()
        return ', '.join([str(c) for c in content_ids if str(c) != 'nan'])

    elif info_type == 'content_item_name':
        content_names = interaction_resources['detail_name'].dropna().unique()
        return ', '.join([str(c) for c in content_names if str(c) != 'nan'])

    elif info_type == 'ticket_type':
        # BUG FIX: the original filtered `ticket_resources['type']` inside the
        # expression that defines ticket_resources, raising NameError on every
        # call; the filter must apply to interaction_resources.
        ticket_resources = interaction_resources[interaction_resources['type'] == 'RESOURCE_TYPE_TICKET']
        if not ticket_resources.empty:
            # Default classification; adjust to your business rules for
            # distinguishing user- vs bot-initiated tickets.
            return "user initiated ticket"
        return ""

    elif info_type == 'ticket_id':
        ticket_resources = interaction_resources[interaction_resources['type'] == 'RESOURCE_TYPE_TICKET']
        ticket_ids = ticket_resources['detail_external_resource_id'].dropna().unique()
        return ', '.join([str(t) for t in ticket_ids if str(t) != 'nan'])

    return ""
278
def get_interaction_surface(interaction_id, interactions_df):
    """Return the platform the interaction took place on, or ''."""
    matches = interactions_df[interactions_df['id'] == interaction_id]
    if matches.empty:
        return ""
    # 'platform' is the field the API uses for the interaction surface.
    surface = matches.iloc[0].get('platform', '')
    return str(surface) if pd.notna(surface) else ""
289
def get_user_language(user_id, users_df):
    """Return the user's preferred language from the users table, or ''."""
    if users_df.empty:
        return ""

    matches = users_df[users_df['id'] == user_id]
    if matches.empty:
        return ""

    # 'user_preferred_language' is the field name in the API response.
    language = matches.iloc[0].get('user_preferred_language', '')
    return str(language) if pd.notna(language) else ""
303
def build_raw_interactions_table(all_dataframes):
    """Assemble the unified Raw Interactions view from the per-endpoint tables.

    One row per user-led interaction; columns are derived from interactions,
    conversations, plugin-calls, plugin-resources, and users data. Returns an
    empty DataFrame when no interactions (or no user interactions) exist.
    """
    logging.info("Building raw interactions table...")

    interactions_df = all_dataframes.get('interactions', pd.DataFrame())
    conversations_df = all_dataframes.get('conversations', pd.DataFrame())
    plugin_calls_df = all_dataframes.get('plugin-calls', pd.DataFrame())
    plugin_resources_df = all_dataframes.get('plugin-resources', pd.DataFrame())
    users_df = all_dataframes.get('users', pd.DataFrame())

    if interactions_df.empty:
        logging.error("No interactions data available")
        return pd.DataFrame()

    # Only user-led interactions become rows of the final view.
    user_interactions = interactions_df[interactions_df['actor'] == 'user'].copy()
    if user_interactions.empty:
        logging.warning("No user interactions found")
        return pd.DataFrame()

    logging.info(f"Processing {len(user_interactions)} user interactions...")

    result = pd.DataFrame()

    # Base columns straight from the interactions table.
    result['Timestamp'] = user_interactions['created_time']
    result['Conversation ID'] = user_interactions['conversation_id']
    result['Interaction ID'] = user_interactions['id']
    result['Interaction Type'] = user_interactions['type']

    # Conversation domain via a conversation-id -> primary_domain lookup.
    if conversations_df.empty:
        result['Conversation Domain'] = ""
    else:
        domain_by_conversation = conversations_df.set_index('id')['primary_domain'].to_dict()
        result['Conversation Domain'] = result['Conversation ID'].map(domain_by_conversation)

    # Topics aggregated per conversation and replicated on each interaction.
    logging.info("Processing conversation entities...")
    result['Conversation Topic'] = result['Conversation ID'].apply(
        lambda cid: get_conversation_topics(interactions_df, cid)
    )

    logging.info("Processing interaction content and bot responses...")
    result['Interaction Content'] = user_interactions.apply(
        lambda r: get_interaction_content(r, interactions_df), axis=1
    )
    result['Bot Response'] = user_interactions.apply(
        lambda r: get_bot_response(r, interactions_df), axis=1
    )

    # Plugin columns from the plugin-calls table.
    logging.info("Processing plugin information...")
    for column, plugin_type in (
        ('Unsuccessful Plugins', 'unsuccessful'),
        ('Plugin Served', 'served'),
        ('Plugin Used', 'used'),
    ):
        result[column] = result['Interaction ID'].apply(
            lambda iid, pt=plugin_type: get_plugin_info(iid, plugin_calls_df, pt)
        )

    # Citation/content/ticket columns from the plugin-resources table.
    logging.info("Processing resource information...")
    for column, info_type in (
        ('Resource Domain', 'domain'),
        ('No of Citations', 'citation_count'),
        ('Content Item Name', 'content_item_name'),
        ('Content Item ID', 'content_item_id'),
        ('Ticket Type', 'ticket_type'),
        ('Ticket ID', 'ticket_id'),
    ):
        result[column] = result['Interaction ID'].apply(
            lambda iid, it=info_type: get_resource_info(iid, plugin_resources_df, it)
        )

    result['Interaction Surface'] = result['Interaction ID'].apply(
        lambda iid: get_interaction_surface(iid, interactions_df)
    )

    if 'user_id' in user_interactions.columns:
        logging.info("Processing user information...")
        result['User Preferred Language'] = user_interactions['user_id'].apply(
            lambda uid: get_user_language(uid, users_df)
        )
    else:
        result['User Preferred Language'] = ""

    # Placeholders — to be populated from an external identity system (e.g. Okta).
    result['User Department'] = ""
    result['User Location'] = ""
    result['User Country'] = ""

    logging.info(f"Raw interactions table built with {len(result)} rows and {len(result.columns)} columns")

    return result
422
def main():
    """Orchestrate the pipeline: prompt for dates, fetch, build, and save."""
    # Get user input for date range
    time_filter, start_date_str, end_date_str = get_user_date_inputs()

    logging.info("Starting raw interactions table builder...")

    # Step 1: Fetch all data
    all_dataframes = {}
    for endpoint in ENDPOINTS:
        df = fetch_data(endpoint, time_filter)
        if not df.empty:
            all_dataframes[endpoint] = df

    if not all_dataframes:
        logging.error("No data retrieved from any endpoint")
        return

    # Step 2: Build raw interactions table
    raw_interactions_table = build_raw_interactions_table(all_dataframes)
    if raw_interactions_table.empty:
        logging.error("Failed to build raw interactions table")
        return

    # Step 3: Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save raw data to Excel (optional).
    # BUG FIX: the engine was misspelled 'openynxl', which makes
    # pd.ExcelWriter raise ValueError before anything is written.
    raw_data_file = Path(f"moveworks_raw_data_{start_date_str}_to_{end_date_str}_{timestamp}.xlsx")
    with pd.ExcelWriter(raw_data_file, engine='openpyxl') as writer:
        for endpoint, df in all_dataframes.items():
            sheet_name = endpoint[:31]  # Excel sheet name max limit
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    logging.info(f"Raw data saved to {raw_data_file}")

    # Save final raw interactions table
    final_file = Path(f"raw_interactions_table_{start_date_str}_to_{end_date_str}_{timestamp}.xlsx")
    raw_interactions_table.to_excel(final_file, index=False, sheet_name='Raw Interactions')
    logging.info(f"Final raw interactions table saved to {final_file}")

    # Also save as CSV for easier processing
    csv_file = Path(f"raw_interactions_table_{start_date_str}_to_{end_date_str}_{timestamp}.csv")
    raw_interactions_table.to_csv(csv_file, index=False)
    logging.info(f"Final raw interactions table saved to {csv_file}")

    # Print summary
    print("\n" + "="*50)
    print("RAW INTERACTIONS TABLE SUMMARY")
    print("="*50)
    print(f"Total interactions processed: {len(raw_interactions_table)}")
    print(f"Date range: {start_date_str} to {end_date_str}")
    print(f"Time filter: {time_filter}")
    print(f"Columns: {list(raw_interactions_table.columns)}")
    print(f"Files generated:")
    print(f" - {raw_data_file} (raw data)")
    print(f" - {final_file} (final table - Excel)")
    print(f" - {csv_file} (final table - CSV)")

    # Show sample data
    if len(raw_interactions_table) > 0:
        print(f"\nSample data (first 3 rows):")
        print(raw_interactions_table.head(3).to_string())

if __name__ == "__main__":
    main()