From dfda9eac9a82c307d4b2448886ae69fe4cd1cb8c Mon Sep 17 00:00:00 2001 From: daniel muniz Date: Tue, 14 Apr 2026 09:00:25 -0300 Subject: [PATCH] implemented detailed endpoint --- claude.md | 276 ++++++++++++++++++ makefile | 8 + playground.js | 4 +- src/server.js | 34 ++- .../parse-detailed-category-use-case.js | 130 +++++++++ test-detailed.http | 49 ++++ 6 files changed, 492 insertions(+), 9 deletions(-) create mode 100644 claude.md create mode 100644 makefile create mode 100644 src/use-cases/parse-detailed-category-use-case.js create mode 100644 test-detailed.http diff --git a/claude.md b/claude.md new file mode 100644 index 0000000..9e71ff9 --- /dev/null +++ b/claude.md @@ -0,0 +1,276 @@ +# Claude Development Guide - NetStar Categorizer + +## Project Overview + +**NetStar Categorizer** is a Node.js microservice that provides domain name/FQDN categorization using the inCompass NetStar SDK. It exposes both UDP and HTTP interfaces for real-time content classification with automatic daily database updates. + +### Core Purpose +- Categorize domains into standardized categories using NetStar's content database +- Map raw NetStar category IDs to organization-specific category codes +- Provide both simple and detailed categorization results with reputation/age ratings +- Maintain updated categorization databases through automated cron jobs + +## Key Architecture + +### Technology Stack +- **Runtime**: Node.js v14+ +- **Web Framework**: Express.js v4 +- **External SDK**: inCompass NetStar v3.1.0-2 (C++ library for categorization) +- **Containerization**: Docker + Kubernetes +- **CI/CD**: CircleCI +- **Infrastructure**: Cloud-agnostic (DigitalOcean, Google Cloud, etc.) + +### Core Services +1. **HTTP Server** (Port 3333) - REST API for categorization requests +2. **UDP Server** (Port 33333) - Legacy UDP interface for categorization +3. **Cron Service** - Daily database updates via Kubernetes CronJob +4. 
**Category Mapping** - NetStar ID → Organization Code conversion (singleton pattern) + +## Project Structure + +``` +src/ +├── server.js # Entry point: initializes UDP + HTTP servers +├── app.js # Core logic: orchestrates categorization flow +├── client.js # UDP test client +├── cron.js # Scheduled database update logic +├── use-cases/ +│ ├── get-category-use-case.js # Executes NetStar gcf1check command +│ ├── category-converter-use-case.js # Maps NetStar IDs → org codes (singleton) +│ ├── parse-detailed-category-use-case.js # Parses detailed output with ratings +│ └── update-categories-use-case.js # Updates NetStar databases +└── etc/ + └── categories-mapping.json # Mapping table: NetStar ID → Zvelo codes + +deployment/ +├── deployment.yaml # Kubernetes deployment manifest +└── staging/deployment.yaml # Staging-specific config + +.circleci/config.yml # CI/CD pipeline (builds, tests, deploys) +Dockerfile # Container build specification +package.json # Node dependencies + scripts +makefile # Convenience commands +test-detailed.http # HTTP endpoint test file +``` + +## Language Standards + +### Comments and Error Messages - ENGLISH ONLY +**All code comments, error messages, log statements, and documentation must be written in English.** + +This includes: +- ✅ Code comments explaining logic +- ✅ Error messages and exception messages +- ✅ Console.log, console.error, and logging statements +- ✅ Variable and function names +- ✅ Commit messages +- ✅ Code review feedback +- ✅ Documentation strings (JSDoc, etc.) 
+ +**Examples:** +```javascript +// Correct: English comment +function mapCategoryId(netstarId) { + if (!netstarId) { + throw new Error('NetStar ID is required') + } + // Map to organization category code + return categoryConverter.convert(netstarId) +} + +// Incorrect: Portuguese comment +function mapCategoryId(netstarId) { + if (!netstarId) { + throw new Error('ID do NetStar é obrigatório') // ❌ ERROR IN PORTUGUESE + } + // Mapear para código de categoria da organização // ❌ COMMENT IN PORTUGUESE + return categoryConverter.convert(netstarId) +} +``` + +## Coding Conventions + +### Code Style +- Use **ES6 syntax** (const/let, arrow functions, template literals) +- **No semicolons** in new code (already established pattern) +- Functional/modular design - keep files focused on single responsibility +- Use **singleton pattern** for shared state (see: `CategoryConverterUseCase`) +- **All comments in English** - see Language Standards section + +### Use Case Pattern +- Each business operation gets a dedicated use case class in `src/use-cases/` +- Use case classes should have a clear, single responsibility +- Example: + ```javascript + class GetCategoryUseCase { + async execute(fqdn) { + // implementation + } + } + ``` + +### Environment Variables +- Defined in `.env` (create from `.env.example`) +- `UDP_PORT=33333` - UDP server listen port +- `HTTP_PORT=3333` - HTTP server listen port + +## HTTP API Endpoints + +### `POST /` - Basic Categorization +- **Input**: `{"fqdn": "example.com"}` +- **Output**: `{"result": [10009, 10010]}` (array of category IDs) +- **Use Case**: Quick lookups when detailed info not needed + +### `POST /detailed` - Detailed Categorization +- **Input**: `{"fqdn": "example.com"}` +- **Output**: Full category info with reputation score, age rating, primary/secondary categories, and human-readable names +- **Use Case**: Comprehensive categorization for security decisions + +### UDP Server (Port 33333) +- **Input**: Raw domain string (e.g., 
`"example.com"`) +- **Output**: JSON-formatted result (same as HTTP `/` endpoint) +- **Legacy Interface**: Maintained for backward compatibility + +## Development Workflow + +### Setup & Running Locally +```bash +# Install dependencies +npm install + +# Development with auto-reload +npm run dev:server # Watch mode for server changes +npm run dev:client # Run UDP client for testing + +# Production +npm start # Start both servers + +# NetStar service commands (Linux system) +make gcf1-start # Start NetStar service +make gcf1-download # Download category databases +make gcf1-update # Update category databases +``` + +### Testing APIs +Use `test-detailed.http` in VS Code REST Client extension: +1. Open the file +2. Click "Send Request" on each endpoint +3. View responses in the side panel + +### Git Workflow +- **Main Branch**: `main` - production stable code +- **Development Branch**: `development` - feature integration +- **Feature Branches**: Create from `development`, merge back via PR +- Recent commits show HTTP approach implementation and cron job additions +- **Commit Messages**: Must be in English + +## Deployment + +### Docker +- **Base Image**: Ubuntu 22.04 +- **Includes**: Boost libraries, Node.js, NetStar SDK +- **Exposes**: Port 3000 (UDP) +- Build: `docker build -t netstar-categorizer .` + +### Kubernetes (Production) +- **Namespace**: `blackdice` +- **Deployment**: Single replica in appropriate cluster +- **CronJob**: Daily database updates at 00:00 UTC +- **Ingress**: `netstar-cat-dev.blackdice.ai` (DNS varies by environment) +- **Branches to Environments**: + - `development` → development cluster + - `qa` → QA cluster + - `staging` → staging cluster + - `production` → production cluster + - `gke-staging`, `gke-pov` → specific GKE clusters + +### CI/CD Pipeline (CircleCI) +- Automatically builds Docker image on push +- Tags image with commit SHA +- Deploys to appropriate Kubernetes cluster based on branch +- Release deployments via git tags + +## Key 
Technical Details + +### Category Mapping System +- **Source**: NetStar SDK returns numeric category IDs (e.g., 101, 102) +- **Mapping File**: `src/etc/categories-mapping.json` +- **Target**: Maps to organization's Zvelo pattern codes (e.g., 10075, 10078) +- **Singleton Implementation**: `CategoryConverterUseCase` maintains single instance across app +- **Example Mapping**: + - NetStar 101 (Illegal Activities) → Zvelo 10075 + - NetStar 201 (Terrorism/Extremists) → Zvelo 10018 + +### NetStar SDK Integration +- **Command**: `gcf1check` - queries the NetStar database for domain categorization +- **Child Process**: Executed via Node.js `child_process` module +- **Output Parsing**: Raw output parsed into JSON structure +- **Detailed Mode**: Includes reputation scores and age ratings in output + +### Automatic Database Updates +- **Mechanism**: Kubernetes CronJob at 0 0 * * * (daily at midnight UTC) +- **Fallback**: Manual update via `make gcf1-update` +- **Purpose**: Keeps categorization database current with latest NetStar classifications + +## Common Tasks + +### Adding a New Endpoint +1. Create a corresponding use case in `src/use-cases/` +2. Add a route in `src/server.js` that calls the use case +3. Add a request for it to `test-detailed.http` and test it +4. Update this guide if it's a significant feature +5. Ensure all error messages and comments are in English + +### Updating Category Mappings +1. Modify `src/etc/categories-mapping.json` with new ID mappings +2. Restart the service (singleton will reload on next request) +3. 
Test with both HTTP and UDP interfaces + +### Debugging +- **Server Logs**: Check Docker/Kubernetes logs for errors +- **Cron Logs**: View Kubernetes CronJob logs for database update issues +- **UDP Testing**: Use `npm run dev:client` to test directly +- **HTTP Testing**: Use `test-detailed.http` with VS Code REST Client +- **Error Messages**: All error logs must be in English + +### Troubleshooting +- **NetStar Service Not Running**: Run `make gcf1-start` +- **Stale Categories**: Manually run `make gcf1-update` or wait for cron job +- **Port Conflicts**: Ensure ports 3333 (HTTP) and 33333 (UDP) are available +- **Docker Build Issues**: Check that Boost C++ libraries are installed correctly + +## Current Development Status + +### Recent Work +- ✅ HTTP server implementation (alongside UDP) +- ✅ Detailed categorization with reputation/age ratings +- ✅ Cron job for automated daily updates +- ✅ Singleton category converter pattern +- 🔄 Work in Progress: + - `playground.js` - experimental/testing code + - `parse-detailed-category-use-case.js` - new detailed parsing feature + - Enhanced `server.js` - expanded server capabilities + +### Known Modified Files +- `playground.js` - development/testing (can be cleaned up) +- `src/server.js` - recent enhancements +- `makefile` - new convenience commands +- `test-detailed.http` - expanded test coverage + +## Guidelines for Contributions + +1. **Follow Existing Patterns**: Use use-case classes, follow module structure +2. **Test Before Committing**: Use `test-detailed.http` for API changes +3. **Update Mappings Properly**: Edit `categories-mapping.json`, not hardcode values +4. **Document Breaking Changes**: Update this guide if architecture changes +5. **Keep CircleCI Happy**: Ensure Docker build succeeds and K8s deployment configs are valid +6. **Don't Skip Steps**: Always test UDP and HTTP interfaces for categorization changes +7. 
**Language Standards**: All comments, error messages, and logs must be in English + +## Resources & External Documentation + +- **NetStar SDK**: Installed in Docker, documentation in inCompass SDK v3.1.0-2 +- **Express.js**: https://expressjs.com +- **Node.js Child Process**: https://nodejs.org/api/child_process.html +- **Kubernetes**: https://kubernetes.io/docs +- **CircleCI**: Configuration at `.circleci/config.yml` diff --git a/makefile b/makefile new file mode 100644 index 0000000..7b893c5 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +gcf1-start: + cd /usr/local/gcf1 && sbin/gcf1 start + +gcf1-download: + cd /usr/local/gcf1 && bin/gcf1dbmng.sh etc urldb_download + +gcf1-update: + cd /usr/local/gcf1 && bin/gcf1dbmng.sh etc urldb_update \ No newline at end of file diff --git a/playground.js b/playground.js index 839952a..1ff4ac1 100644 --- a/playground.js +++ b/playground.js @@ -7,12 +7,12 @@ const { spawn, exec } = require("node:child_process") // }) -exec("echo '99999.incompass.netstar-inc.com' | bin/gcf1check.sh etc check_categorize_hybrid", { cwd: '/usr/local/gcf1' }, (error, stdout, stderr)=>{ +exec("echo 'li12.pages.dev' | bin/gcf1check.sh etc check_categorize_hybrid", { cwd: '/usr/local/gcf1' }, (error, stdout, stderr)=>{ if (error) { console.error(error) return } - console.log(stdout.trim()) + console.log(stdout.trim().split(/\t|\s{2,}/)) }) \ No newline at end of file diff --git a/src/server.js b/src/server.js index 86a1fd0..c080de9 100644 --- a/src/server.js +++ b/src/server.js @@ -5,14 +5,21 @@ const app = require('./app') const bodyParser = require('body-parser') const cron = require('./cron') const express = require('express') +const { ParseDetailedCategoryUseCase } = require('./use-cases/parse-detailed-category-use-case') +const { CategoryConverterUseCase } = require('./use-cases/category-converter-use-case') +const categoriesMapping = require('./etc/categories-mapping.json') const httpServer = express() httpServer.use(bodyParser.json()) // for 
parsing application/json httpServer.use(bodyParser.urlencoded({ extended: true })) // for parsing application/x-www-form-urlencoded cron() -const UDP_PORT = process.env.UDP_PORT -const HTTP_PORT = process.env.HTTP_PORT +const UDP_PORT = process.env.UDP_PORT +const HTTP_PORT = process.env.HTTP_PORT + +// Initialize use cases +const categoryConverter = new CategoryConverterUseCase({ categoriesMapping }) +const parseDetailedCategory = new ParseDetailedCategoryUseCase({ categoryConverter }) const server = dgram.createSocket('udp4'); @@ -54,19 +61,32 @@ httpServer.post('/', async (req, res) => { const { fqdn } = req.body const categories = await app(fqdn) - + let result = {} - - if (categories) + + if (categories) result = { result: categories } - + res.status(200).json(JSON.stringify(result)) } catch (err) { res.status(500).json(err) } - + +}) + +httpServer.post('/detailed', async (req, res) => { + try { + const { fqdn } = req.body + + const detailedResult = await parseDetailedCategory.execute(fqdn) + + res.status(200).json(detailedResult) + } catch (err) { + console.error('Error in /detailed endpoint:', err) + res.status(500).json({ error: err.message || err }) + } }) httpServer.listen(HTTP_PORT, () => { diff --git a/src/use-cases/parse-detailed-category-use-case.js b/src/use-cases/parse-detailed-category-use-case.js new file mode 100644 index 0000000..1328ad3 --- /dev/null +++ b/src/use-cases/parse-detailed-category-use-case.js @@ -0,0 +1,130 @@ +const { exec } = require("node:child_process") + +class ParseDetailedCategoryUseCase { + constructor({ categoryConverter }) { + this.categoryConverter = categoryConverter; + } + + execute(domain) { + return new Promise((resolve, reject) => { + exec(`echo ${domain} | bin/gcf1check.sh etc check_categorize_hybrid`, + { cwd: '/usr/local/gcf1' }, + (error, stdout, stderr) => { + if (error) { + console.error(error); + reject(error); + return; + } + + try { + const parsed = this.parseOutput(stdout); + resolve(parsed); + } catch 
(parseError) { + console.error('Parse error:', parseError); + reject(parseError); + } + }); + }); + } + + parseOutput(output) { + // Extract all quoted strings in order + const quotedStrings = []; + const quoteRegex = /"([^"]*)"/g; + let match; + while ((match = quoteRegex.exec(output)) !== null) { + quotedStrings.push(match[1]); + } + + // Split on whitespace + const parts = output.trim().split(/\t|\s{2,}/); + + // Find numeric IDs by looking for numbers that appear in sequence + // After "Categorized", we have: count, primary_id, secondary_id, ..., reputation_score, ..., age_rating_score + + // Primary ID is parts[2] (always third element after "Categorized" and count) + const primary = parts[2]; + const primaryName = quotedStrings[0]; + + // Secondary ID is parts[4] (always fifth element) + const secondary = parts[4]; + + let secondaryName = ''; + let quotedIndex = 1; // Start after primary name + + // If secondary is not "0", it has a quoted name + if (secondary !== '0' && secondary !== '-') { + secondaryName = quotedStrings[quotedIndex]; + quotedIndex++; + } else { + secondaryName = parts[5] === '-' ? 
'-' : parts[5]; + } + + // Find reputation score - it's a single digit that comes after some markers + // Look for the pattern: a digit followed by a quoted reputation name + let reputation = ''; + let reputationName = ''; + + // Scan from parts[7] onwards to find reputation (it's usually around parts[8-11]) + for (let i = 7; i < parts.length; i++) { + const part = parts[i]; + // Look for single/double digit that's not a category ID + if (!isNaN(part) && part !== '-' && !part.includes('x') && !part.includes('|')) { + const num = parseInt(part); + // Reputation scores are typically 0-5, single digit + if (num >= 0 && num <= 5 && parts[i+1] !== '-' && !parts[i+1].includes('0x')) { + reputation = part; + reputationName = quotedStrings[quotedIndex]; + quotedIndex++; + break; + } + } + } + + // Find age rating - it's another single digit that comes after more markers + // After reputation, we should find age rating + let ageRating = ''; + let ageRatingName = ''; + + for (let i = 12; i < parts.length; i++) { + const part = parts[i]; + if (!isNaN(part) && part !== '-' && !part.includes('x') && !part.includes('|')) { + const num = parseInt(part); + if (num >= 0 && num <= 5) { + // Make sure it's not already used as reputation + if (part !== reputation || i > 10) { + ageRating = part; + ageRatingName = quotedStrings[quotedIndex]; + break; + } + } + } + } + + // Convert category IDs using the mapper + const primaryMapped = this.categoryConverter.execute(primary); + const secondaryMapped = secondary !== '-' && secondary !== '0' ? 
+ this.categoryConverter.execute(secondary) : null; + + // Build result array in the same format as the / endpoint (as strings) + const resultArray = [String(primaryMapped)]; + if (secondaryMapped !== null) { + resultArray.push(String(secondaryMapped)); + } + + return { + result: resultArray, + primary: primaryMapped, + primary_name: primaryName, + secondary: secondaryMapped, + secondary_name: secondaryName, + reputation: parseInt(reputation), + reputation_name: reputationName, + age_rating: parseInt(ageRating), + age_rating_name: ageRatingName, + raw_output: output.trim() + }; + } +} + +module.exports = { ParseDetailedCategoryUseCase } diff --git a/test-detailed.http b/test-detailed.http new file mode 100644 index 0000000..a5553fa --- /dev/null +++ b/test-detailed.http @@ -0,0 +1,49 @@ +@baseUrl = http://localhost:3333 + +### Test detailed endpoint with Facebook +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "facebook.com" +} + +### Test detailed endpoint with Google +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "tiktok.com" +} + +### Test detailed endpoint with Reddit +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "reddit.com" +} + +### Test detailed endpoint with YouTube +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "youtube.com" +} + +### Test detailed endpoint with Twitter +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "twitter.com" +} + +### Test simple endpoint with Facebook (original endpoint) +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "facebook.com" +}