import csv
import os
import shutil
# NOTE: `time` below is the datetime.time class, NOT the `time` module.
# It is kept so existing `from <module> import time` users keep working;
# use `sleep` (imported from the real `time` module) for pauses.
from datetime import datetime, time
from time import sleep

import pandas as pd
import requests
from ta.momentum import StochasticOscillator
from ta.trend import EMAIndicator


class DataEngine:
    """Download, cache and summarise daily closing prices for one instrument.

    Prices are cached as ``<symbol>.csv`` (columns ``date`` and ``close``)
    inside ``data_dir``, which is resolved relative to this source file.
    ``global_sync`` re-points the same instance at each instrument listed in
    ``instruments.csv`` in turn.
    """

    def __init__(self, symbol=None, url=None, provider=None, data_dir='data_cache'):
        """Store the instrument settings and make sure the cache folder exists.

        Args:
            symbol: Instrument identifier; also the cache file's base name.
            url: Endpoint that ``fetch_data`` downloads from.
            provider: Parser selector: ``'jpm'``, ``'yahoo'`` or ``'agi'``.
            data_dir: Cache folder name, relative to this file's directory.
        """
        self.symbol = symbol
        self.url = url
        self.provider = provider

        base_path = os.path.dirname(os.path.abspath(__file__))
        self.cache_dir = os.path.join(base_path, data_dir)
        os.makedirs(self.cache_dir, exist_ok=True)

        # Without a symbol there is no meaningful cache file to point at.
        if self.symbol:
            self.file_path = os.path.join(self.cache_dir, f"{self.symbol}.csv")
        else:
            self.file_path = None

    def load_instruments_from_csv(self, file_path):
        """Read the instrument list and build each instrument's download URL.

        The CSV header is matched case-insensitively and is expected to
        contain ``symbol`` and ``cusip`` columns plus an optional
        ``provider`` column. Rows missing a symbol or cusip are skipped.

        Args:
            file_path: CSV path; relative paths are resolved against this
                source file's directory.

        Returns:
            A list of dicts with keys ``symbol``, ``url``, ``provider`` and
            ``cusip``. An empty list when the file is missing; whatever was
            parsed so far when reading fails part-way.
        """
        instruments = []

        templates = {
            'jpm': "https://am.jpmorgan.com/FundsMarketingHandler/historicalData?cusip={cusip}&country=hk&role=per",
            # period1=0 asks for data from the earliest available date;
            # interval=1d is daily.
            'yahoo': "https://query1.finance.yahoo.com/v8/finance/chart/{cusip}?period1=0&period2=9999999999&interval=1d&events=history",
            # Original author's note: FT only serves a ~30-day window, so
            # history accumulates through the cache merge in fetch_data.
            'agi': "https://markets.ft.com/data/funds/tearsheet/historical?s={cusip}",
        }

        try:
            # abspath() keeps this consistent with __init__ even when
            # __file__ is a relative path.
            base_path = os.path.dirname(os.path.abspath(__file__))
            abs_path = os.path.join(base_path, file_path)
            if not os.path.exists(abs_path):
                print(f"Error: {file_path} not found.")
                return []

            with open(abs_path, mode='r', encoding='utf-8-sig') as csvfile:
                reader = csv.DictReader(csvfile)
                # fieldnames is None for a completely empty file.
                reader.fieldnames = [
                    name.strip().lower() for name in (reader.fieldnames or [])
                ]

                for row in reader:
                    # DictReader fills cells missing from a short row with
                    # None, so `or ''` is needed before calling .strip().
                    symbol = (row.get('symbol') or '').strip()
                    cusip = (row.get('cusip') or '').strip()
                    # A blank provider cell means "use the default".
                    provider = (row.get('provider') or '').strip().lower() or 'jpm'

                    if not (symbol and cusip):
                        continue

                    template = templates.get(provider, templates['jpm'])
                    instruments.append({
                        "symbol": symbol,
                        "url": template.format(cusip=cusip),
                        "provider": provider,
                        "cusip": cusip,
                    })
        except Exception as e:
            # Best effort: report the problem and return what was parsed.
            print(f"CSV Loading Error: {e}")

        return instruments

    def global_sync(self, pause_seconds=1):
        """Back up the cache, refresh every instrument and summarise the run.

        The instance's ``symbol``, ``provider``, ``url`` and ``file_path``
        are overwritten for each instrument and are left pointing at the
        last one processed.

        Args:
            pause_seconds: Delay after each fetch, to space out requests.

        Returns:
            A dict with ``total``, ``updated`` and ``failed`` counts and a
            ``details`` list holding one message per instrument. An
            instrument counts as updated whenever ``fetch_data`` returns a
            non-empty frame, which includes the cached frame it returns
            after a network error.
        """
        self.run_pre_sync_maintenance()

        instruments = self.load_instruments_from_csv('instruments.csv')
        report = {
            "total": len(instruments),
            "updated": 0,
            "failed": 0,
            "details": [],
        }

        for item in instruments:
            try:
                self.symbol = item['symbol']
                self.provider = item['provider']
                self.url = item['url']
                self.file_path = os.path.join(self.cache_dir, f"{self.symbol}.csv")

                print(f"Updating {self.symbol}...")
                result_df = self.fetch_data()

                # Must be the time module's sleep: the module-level name
                # `time` is datetime.time, which has no sleep() and made
                # every instrument land in the except branch below.
                sleep(pause_seconds)

                if result_df is not None and not result_df.empty:
                    report["updated"] += 1
                    last_price = result_df['close'].iloc[-1]
                    report["details"].append(
                        f"✅ {self.symbol}: Updated (Price: {last_price})"
                    )
                else:
                    report["failed"] += 1
                    report["details"].append(f"❌ {self.symbol}: No new data found")
            except Exception as e:
                report["failed"] += 1
                report["details"].append(f"⚠️ {self.symbol}: Error ({e})")

        return report

    def run_pre_sync_maintenance(self):
        """Copy every cached CSV into a timestamped backup folder.

        Backups go to ``backups/sync_backup_<YYYYmmdd_HHMM>`` next to this
        source file. Each file is also read back to print a short report.

        Returns:
            A DataFrame with ``Fund``, ``Rows``, ``Start`` and ``End``
            columns, one row per backed-up file; an empty DataFrame when the
            cache folder is missing or no CSV could be backed up.
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        backup_dir = os.path.join(base_dir, 'backups')

        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        current_backup_path = os.path.join(backup_dir, f"sync_backup_{timestamp}")
        os.makedirs(current_backup_path, exist_ok=True)

        print(f"\n--- Pre-Sync Health Check ({timestamp}) ---")

        if not os.path.exists(self.cache_dir):
            print(f"⚠️ Cache directory not found at {self.cache_dir}")
            return pd.DataFrame()

        stats = []
        for filename in os.listdir(self.cache_dir):
            if not filename.endswith(".csv"):
                continue

            src = os.path.join(self.cache_dir, filename)
            dst = os.path.join(current_backup_path, filename)
            try:
                shutil.copy2(src, dst)

                df = pd.read_csv(src)
                has_dates = 'date' in df.columns
                stats.append({
                    # splitext strips only the trailing extension.
                    "Fund": os.path.splitext(filename)[0],
                    "Rows": len(df),
                    "Start": df['date'].min() if has_dates else "N/A",
                    "End": df['date'].max() if has_dates else "N/A",
                })
                print(f"📦 Backed up: {filename} ({len(df)} rows)")
            except Exception as e:
                # One unreadable file must not stop the remaining backups.
                print(f"⚠️ Could not backup {filename}: {e}")
                continue

        if not stats:
            print("📭 No CSV files found to backup.")
            return pd.DataFrame()

        stats_df = pd.DataFrame(stats)
        print("\n" + stats_df.to_string(index=False))
        print(f"\n✅ All backups saved to: {current_backup_path}")
        return stats_df

    def _parse_jpm(self, json_data):
        """Turn a JPM payload into a DataFrame, or None if it has no NAV list.

        The ``historicalNAVList`` entries become rows and the ``navPrice``
        column is renamed to ``close``.
        """
        if isinstance(json_data, dict) and "historicalNAVList" in json_data:
            df = pd.DataFrame(json_data["historicalNAVList"])
            return df.rename(columns={'navPrice': 'close'})
        return None

    def _parse_ft_html(self, html_text):
        """Extract date/close rows from an FT historical-prices HTML page.

        Returns:
            A DataFrame with ``date`` and ``close`` columns sorted by date,
            or None when the results table is missing, holds no usable rows
            or cannot be parsed.
        """
        try:
            # Imported lazily so bs4 is only required for FT instruments.
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html_text, 'html.parser')
            table = soup.find('table', class_='mod-tearsheet-historical-prices__results')
            if not table:
                print(f"❌ Could not find the results table in the HTML for {self.symbol}")
                return None

            data = []
            for row in table.find('tbody').find_all('tr'):
                cols = row.find_all('td')
                if len(cols) < 5:
                    continue

                # The date cell holds two spans; prefer the one carrying
                # this class and fall back to the cell's full text.
                date_cell = cols[0].find('span', class_='mod-ui-hide-small-below')
                if date_cell:
                    date_str = date_cell.get_text(strip=True)
                else:
                    date_str = cols[0].get_text(strip=True)

                # Close is read from the fifth column; commas are removed
                # so the value can be converted to a number.
                close_str = cols[4].get_text(strip=True).replace(',', '')
                data.append({'date': date_str, 'close': close_str})

            df = pd.DataFrame(data)
            if df.empty:
                return None

            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['close'] = pd.to_numeric(df['close'], errors='coerce')
            return df.dropna().sort_values('date').reset_index(drop=True)
        except Exception as e:
            print(f"❌ Failed to parse FT HTML structure: {e}")
            return None

    def _parse_yahoo(self, json_data):
        """Parse a Yahoo Finance v8 chart payload into date/close rows.

        Only the ``close`` quote series is read. Returns None when the
        payload does not have the expected structure.
        """
        try:
            chart = json_data['chart']['result'][0]
            timestamps = chart['timestamp']
            quote = chart['indicators']['quote'][0]
            closes = quote.get('close', [])

            return pd.DataFrame({
                'date': pd.to_datetime(timestamps, unit='s'),
                'close': closes,
            })
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # Missing keys, empty result lists, None payloads or series of
            # mismatched length all mean "no usable data".
            return None

    def fetch_data(self):
        """Download fresh prices and merge them into the local CSV cache.

        Returns:
            The merged ``date``/``close`` frame (also written to the cache
            file) when the download produced rows. Otherwise the frame
            loaded from the cache, which is empty when no usable cache
            exists. Network and parsing errors are printed, not raised.
        """
        local_df = pd.DataFrame()
        new_df = None

        # 1. Load the local cache, normalising column names and date dtype.
        if self.file_path and os.path.exists(self.file_path):
            try:
                local_df = pd.read_csv(self.file_path)
                local_df = local_df.loc[:, ~local_df.columns.duplicated()].copy()
                local_df.columns = [c.lower().strip() for c in local_df.columns]
                local_df = local_df.rename(columns={'price': 'close', 'nav': 'close'})

                # Unparseable dates become NaT and are dropped so the later
                # sort compares Timestamps only.
                local_df['date'] = pd.to_datetime(local_df['date'], errors='coerce')
                local_df = local_df.dropna(subset=['date']).reset_index(drop=True)
            except Exception as e:
                print(f"Local Load Error: {e}")
                # Do not merge a half-processed cache frame.
                local_df = pd.DataFrame()

        # 2. Fetch from the network and parse with the provider's parser.
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(self.url, headers=headers, timeout=15)
            response.raise_for_status()

            if self.provider == 'agi':
                new_df = self._parse_ft_html(response.text)
            elif self.provider == 'jpm':
                new_df = self._parse_jpm(response.json())
            elif self.provider == 'yahoo':
                new_df = self._parse_yahoo(response.json())

            # 3. Merge; on duplicate dates the downloaded row wins.
            if new_df is not None and not new_df.empty:
                new_df['date'] = pd.to_datetime(new_df['date'], errors='coerce')

                combined_df = pd.concat([local_df, new_df], ignore_index=True)
                combined_df = combined_df.drop_duplicates(subset=['date'], keep='last')
                combined_df = combined_df.sort_values('date').reset_index(drop=True)

                if 'close' in combined_df.columns:
                    final_df = combined_df[['date', 'close']].dropna()
                    if self.file_path:
                        final_df.to_csv(self.file_path, index=False)
                    return final_df

            return local_df
        except Exception as e:
            print(f"Network error for {self.symbol}: {e}")
            return local_df

    def get_local_metrics(self):
        """Compute metrics from the cached CSV only, without network access.

        Returns:
            The ``calculate_table_metrics`` result; a dict with ``error``
            and ``status`` keys when there is no cache file; None when the
            cache cannot be read.
        """
        if not self.file_path or not os.path.exists(self.file_path):
            return {"error": "Missing Local Data", "status": "needs_sync"}

        try:
            df = pd.read_csv(self.file_path)
            df.columns = [c.lower().strip() for c in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df.dropna(subset=['date', 'close']).sort_values('date')
            return self.calculate_table_metrics(df)
        except Exception as e:
            print(f"Error reading local data for {self.symbol}: {e}")
            return None

    def calculate_table_metrics(self, df):
        """Summarise a price frame into the values shown in the table.

        Args:
            df: Frame with a ``close`` column, ordered oldest to newest.

        Returns:
            None when ``df`` is None or has fewer than two rows. Otherwise
            a dict with ``last_close``, ``change_pct`` (versus the previous
            row), ``low_52``/``high_52`` (min/max close over the last 252
            rows), ``last_ema20``/``50``/``100``/``200`` (percent distance
            of the last close from that EMA, or ``"N/A"`` with too few
            rows) and ``kd_values`` (``"K/D"`` stochastic text, or
            ``"N/A"`` with fewer than 14 rows).
        """
        if df is None or df.empty or len(df) < 2:
            return None

        last_close = float(df.iloc[-1]['close'])
        prev_close = float(df.iloc[-2]['close'])
        change_pct = ((last_close - prev_close) / prev_close) * 100
        count = len(df)

        def get_ema_offset(window):
            # Percent by which the last close sits above/below the EMA.
            if count >= window:
                ema = EMAIndicator(close=df['close'], window=window).ema_indicator().iloc[-1]
                return round(((last_close / ema) * 100) - 100, 1)
            return "N/A"

        k_val = d_val = "N/A"
        if count >= 14:
            # Only closes are available, so rolling close extremes are
            # passed in as the oscillator's high/low series.
            high_14 = df['close'].rolling(window=14).max()
            low_14 = df['close'].rolling(window=14).min()
            stoch = StochasticOscillator(high=high_14, low=low_14, close=df['close'], window=14)
            k_val = round(stoch.stoch().iloc[-1], 0)
            d_val = round(stoch.stoch_signal().iloc[-1], 0)

        recent_closes = df.tail(252)['close']
        return {
            "last_close": round(last_close, 2),
            "change_pct": round(change_pct, 2),
            "low_52": round(float(recent_closes.min()), 2),
            "high_52": round(float(recent_closes.max()), 2),
            "last_ema20": get_ema_offset(20),
            "last_ema50": get_ema_offset(50),
            "last_ema100": get_ema_offset(100),
            "last_ema200": get_ema_offset(200),
            "kd_values": f"{k_val}/{d_val}" if k_val != "N/A" else "N/A",
        }