diff --git a/README.md b/README.md index 0b49c42..86e4505 100644 --- a/README.md +++ b/README.md @@ -50,20 +50,27 @@ HYBRID_NS_ACCESS=andkWEzbBWRR2K6iw4edRvfd6MmNNjRx HYBRID_USER_AGENT=CSUX_blackdice ``` -6. To start the service, run: +6. To start the service: ``` -sbin/gcf1 start -Starting gcf1d services: ...succeeded +cd /usr/local/gcf1/ + +./sbin/gcf1 start + +# Starting gcf1d services: ...succeeded ``` 7. To download/update databases, run: ``` -bin/gcf1dbmng.sh etc urldb_download -Success DB download . -bin/gcf1dbmng.sh etc urldb_update -Success DB update . +cd /usr/local/gcf1/ + +./bin/gcf1dbmng.sh etc urldb_download + +# Success DB download . + +./bin/gcf1dbmng.sh etc urldb_update +# Success DB update . ``` ## UDP Server Setup @@ -85,14 +92,14 @@ npm install ``` cp .env.example .env -PORT=3000 + +# --------- example contents --------- +# UDP_PORT=33333 +# HTTP_PORT=3333 +# ------------------------------------ ``` -## Running the Server - -To start the UDP server, use the following command: - -``` -npm start -``` -This will run the server and bind it to the port specified in your .env file. +## Digital Ocean Deployment details +1. The Node server is installed to `/opt/netstar-categorizer` +2. Rather than using `npm start` (which doesn't restart after a crash), we use **[PM2](https://pm2.keymetrics.io/)**, which provides a more robust runtime +3. The full command to start the server using pm2 is `pm2 start npm --name "netstar-categorizer" --cwd /opt/netstar-categorizer/ -- start` - for convenience, there's a `run-me.sh` script in `/root` that will execute this command. 
diff --git a/claude.md b/claude.md index 9e71ff9..de56902 100644 --- a/claude.md +++ b/claude.md @@ -125,6 +125,20 @@ function mapCategoryId(netstarId) { - **Input**: `{"fqdn": "example.com"}` - **Output**: Full category info with reputation score, age rating, primary/secondary categories, and human-readable names - **Use Case**: Comprehensive categorization for security decisions +- **Includes**: `result` array matching the `/` endpoint format + +### `POST /full` - Complete Raw Categorization Output +- **Input**: `{"fqdn": "example.com"}` +- **Output**: Complete detailed structure with all 35 fields from NetStar including: + - Primary, secondary, and security categories (with IDs, names, and mapped IDs) + - Reputation score and name + - Matching flags and their descriptions + - Age rating score and name + - All 9 category group classifications (Internet/Infrastructure, Malware/Security, Dangerous/Harmful, Adult, Business/Government, Personal, Computing/Technology, Social Media, Miscellaneous) + - Volume index + - Submitted URL +- **Use Case**: Complete diagnostic and analysis when all categorization data is needed +- **Includes**: `result` array matching the `/` endpoint format ### UDP Server (Port 33333) - **Input**: Raw domain string (e.g., `"example.com"`) diff --git a/src/app.js b/src/app.js index 467c05b..b583650 100644 --- a/src/app.js +++ b/src/app.js @@ -7,10 +7,12 @@ const getCategory = new GetCategoryUseCase(); const categoryConverter = new CategoryConverterUseCase({categoriesMapping}); module.exports = async function app(domain){ + console.log(domain) const category = await getCategory.execute(domain) + console.log(category) const categoryConverted = await categoryConverter.execute(category) console.log(categoryConverted) - return categoryConverted + return [categoryConverted] } diff --git a/src/cron.js b/src/cron.js index ab71740..b94dcb9 100644 --- a/src/cron.js +++ b/src/cron.js @@ -3,8 +3,10 @@ const { UpdateCategoriesUseCase } = 
require('./use-cases/update-categories-use-c const updateCategories = new UpdateCategoriesUseCase() module.exports = async function cron() { + updateCategories.execute() + const oneDay = 1000 * 60 * 60 * 24 - setImmediate(() => { + setInterval(() => { updateCategories.execute() }, oneDay) } diff --git a/src/etc/categories-mapping.json b/src/etc/categories-mapping.json index 5b05a04..3768903 100644 --- a/src/etc/categories-mapping.json +++ b/src/etc/categories-mapping.json @@ -1109,7 +1109,7 @@ "id": "20103", "description": "Code Repositories", "related": [ - "10429" + "10047" ] }, { diff --git a/src/server.js b/src/server.js index c080de9..509384b 100644 --- a/src/server.js +++ b/src/server.js @@ -6,6 +6,7 @@ const bodyParser = require('body-parser') const cron = require('./cron') const express = require('express') const { ParseDetailedCategoryUseCase } = require('./use-cases/parse-detailed-category-use-case') +const { ParseFullCategoryUseCase } = require('./use-cases/parse-full-category-use-case') const { CategoryConverterUseCase } = require('./use-cases/category-converter-use-case') const categoriesMapping = require('./etc/categories-mapping.json') @@ -19,7 +20,8 @@ const HTTP_PORT = process.env.HTTP_PORT // Initialize use cases const categoryConverter = new CategoryConverterUseCase({ categoriesMapping }) -const parseDetailedCategory = new ParseDetailedCategoryUseCase({ categoryConverter }) +const parseDetailedCategory = new ParseDetailedCategoryUseCase({ categoryConverter }) +const parseFullCategory = new ParseFullCategoryUseCase({ categoryConverter }) const server = dgram.createSocket('udp4'); @@ -57,6 +59,7 @@ server.on('listening', () => { server.bind(UDP_PORT); httpServer.post('/', async (req, res) => { + console.log("new request") try { const { fqdn } = req.body @@ -89,8 +92,22 @@ httpServer.post('/detailed', async (req, res) => { } }) +httpServer.post('/full', async (req, res) => { + try { + const { fqdn } = req.body + + const fullResult = await 
parseFullCategory.execute(fqdn) + + res.status(200).json(fullResult) + } catch (err) { + console.error('Error in /full endpoint:', err) + res.status(500).json({ error: err.message || err }) + } +}) + httpServer.listen(HTTP_PORT, () => { console.log('HTTP server listening 3333') + console.log('UDP server listening 3334') }) // Response diff --git a/src/test-app.js b/src/test-app.js new file mode 100644 index 0000000..e14acbe --- /dev/null +++ b/src/test-app.js @@ -0,0 +1,10 @@ +const app = require('./app'); + +(async () => { + try { + const categories = await app('www.facebook.com'); + console.log(categories); + } catch (error) { + console.error('Error occurred:', error); + } +})(); \ No newline at end of file diff --git a/src/use-cases/category-converter-use-case.js b/src/use-cases/category-converter-use-case.js index 8a4d2af..9f61503 100644 --- a/src/use-cases/category-converter-use-case.js +++ b/src/use-cases/category-converter-use-case.js @@ -11,7 +11,7 @@ class CategoryConverterUseCase { execute(category) { const entry = this.categoriesMapping.find(item => item.id === category); - return entry ? entry.related[0].split(', ').map(str => str.trim()) : null; + return entry ? 
entry.related[0] : null; } } diff --git a/src/use-cases/get-category-use-case.js b/src/use-cases/get-category-use-case.js index ccbb022..f4105fd 100644 --- a/src/use-cases/get-category-use-case.js +++ b/src/use-cases/get-category-use-case.js @@ -10,11 +10,11 @@ class GetCategoryUseCase { return; } - const outputParts = stdout.split(/\s+/); - if (outputParts[3]) { - const categoryId = outputParts[2]; - console.log({ categoryId }); - resolve(categoryId); + const outputParts = stdout.split(/\t/); + if (outputParts[6] !== '0') { + resolve(outputParts[6]) + } else if (outputParts[3]) { + resolve(outputParts[2]); } else { console.log({ error: 'Category ID not found' }); reject('Category ID not found'); diff --git a/src/use-cases/parse-detailed-category-use-case.js b/src/use-cases/parse-detailed-category-use-case.js index 1328ad3..a4dcb20 100644 --- a/src/use-cases/parse-detailed-category-use-case.js +++ b/src/use-cases/parse-detailed-category-use-case.js @@ -36,77 +36,48 @@ class ParseDetailedCategoryUseCase { quotedStrings.push(match[1]); } - // Split on whitespace - const parts = output.trim().split(/\t|\s{2,}/); + // Split on tabs (primary separator in NetStar output) + const parts = output.trim().split('\t'); + let quotedIndex = 0; - // Find numeric IDs by looking for numbers that appear in sequence - // After "Categorized", we have: count, primary_id, secondary_id, ..., reputation_score, ..., age_rating_score + // Position mapping based on NetStar output structure: + // parts[0]: Categorized + // parts[1]: Matching Index + // parts[2]: Primary Category ID + // parts[3]: Primary Category Name (quoted) + // parts[4]: Secondary Category ID + // parts[5]: Secondary Category Name (quoted if secondary exists) + // parts[6]: Security Category ID + // parts[7]: Security Category Name + // parts[8]: Reputation Score ID + // parts[9]: Reputation Score Name (quoted) + // parts[10]: Matching Flag Value + // parts[11]: Matching Flag Names + // parts[12]: Age Rating ID + // 
parts[13]: Age Rating Name (quoted) - // Primary ID is parts[2] (always third element after "Categorized" and count) - const primary = parts[2]; - const primaryName = quotedStrings[0]; + const primaryId = parseInt(parts[2]); + const primaryName = quotedStrings[quotedIndex++]; + const primaryMapped = this.categoryConverter.execute(String(primaryId)); - // Secondary ID is parts[4] (always fifth element) - const secondary = parts[4]; + const secondaryId = parts[4] !== '0' && parts[4] !== '-' ? parseInt(parts[4]) : null; + let secondaryName = null; + let secondaryMapped = null; - let secondaryName = ''; - let quotedIndex = 1; // Start after primary name - - // If secondary is not "0", it has a quoted name - if (secondary !== '0' && secondary !== '-') { - secondaryName = quotedStrings[quotedIndex]; - quotedIndex++; + if (secondaryId !== null) { + secondaryName = quotedStrings[quotedIndex++]; + secondaryMapped = this.categoryConverter.execute(String(secondaryId)); } else { - secondaryName = parts[5] === '-' ? '-' : parts[5]; + secondaryName = parts[5] === '-' ? 
null : parts[5]; } - // Find reputation score - it's a single digit that comes after some markers - // Look for the pattern: a digit followed by a quoted reputation name - let reputation = ''; - let reputationName = ''; + const reputationId = parseInt(parts[8]); + const reputationName = quotedStrings[quotedIndex++]; - // Scan from parts[7] onwards to find reputation (it's usually around parts[8-11]) - for (let i = 7; i < parts.length; i++) { - const part = parts[i]; - // Look for single/double digit that's not a category ID - if (!isNaN(part) && part !== '-' && !part.includes('x') && !part.includes('|')) { - const num = parseInt(part); - // Reputation scores are typically 0-5, single digit - if (num >= 0 && num <= 5 && parts[i+1] !== '-' && !parts[i+1].includes('0x')) { - reputation = part; - reputationName = quotedStrings[quotedIndex]; - quotedIndex++; - break; - } - } - } + const ageRatingId = parseInt(parts[12]); + const ageRatingName = quotedStrings[quotedIndex++]; - // Find age rating - it's another single digit that comes after more markers - // After reputation, we should find age rating - let ageRating = ''; - let ageRatingName = ''; - - for (let i = 12; i < parts.length; i++) { - const part = parts[i]; - if (!isNaN(part) && part !== '-' && !part.includes('x') && !part.includes('|')) { - const num = parseInt(part); - if (num >= 0 && num <= 5) { - // Make sure it's not already used as reputation - if (part !== reputation || i > 10) { - ageRating = part; - ageRatingName = quotedStrings[quotedIndex]; - break; - } - } - } - } - - // Convert category IDs using the mapper - const primaryMapped = this.categoryConverter.execute(primary); - const secondaryMapped = secondary !== '-' && secondary !== '0' ? 
- this.categoryConverter.execute(secondary) : null; - - // Build result array in the same format as the / endpoint (as strings) + // Build result array in the same format as the / endpoint const resultArray = [String(primaryMapped)]; if (secondaryMapped !== null) { resultArray.push(String(secondaryMapped)); @@ -118,9 +89,9 @@ class ParseDetailedCategoryUseCase { primary_name: primaryName, secondary: secondaryMapped, secondary_name: secondaryName, - reputation: parseInt(reputation), + reputation: reputationId, reputation_name: reputationName, - age_rating: parseInt(ageRating), + age_rating: ageRatingId, age_rating_name: ageRatingName, raw_output: output.trim() }; diff --git a/src/use-cases/parse-full-category-use-case.js b/src/use-cases/parse-full-category-use-case.js new file mode 100644 index 0000000..00e5163 --- /dev/null +++ b/src/use-cases/parse-full-category-use-case.js @@ -0,0 +1,172 @@ +const { exec } = require("node:child_process") + +class ParseFullCategoryUseCase { + constructor({ categoryConverter }) { + this.categoryConverter = categoryConverter; + } + + execute(domain) { + return new Promise((resolve, reject) => { + exec(`echo ${domain} | bin/gcf1check.sh etc check_categorize_hybrid`, + { cwd: '/usr/local/gcf1' }, + (error, stdout, stderr) => { + if (error) { + console.error(error); + reject(error); + return; + } + + try { + const parsed = this.parseOutput(stdout); + resolve(parsed); + } catch (parseError) { + console.error('Parse error:', parseError); + reject(parseError); + } + }); + }); + } + + parseOutput(output) { + const quotedStrings = []; + const quoteRegex = /"([^"]*)"/g; + let match; + while ((match = quoteRegex.exec(output)) !== null) { + quotedStrings.push(match[1]); + } + + const parts = output.trim().split('\t'); + let quotedIndex = 0; + + // Extract primary category (parts[2], parts[3]) + const primaryId = parseInt(parts[2]); + const primaryName = quotedStrings[quotedIndex++]; + const primaryMapped = 
this.categoryConverter.execute(String(primaryId)); + + // Extract secondary category (parts[4], parts[5]) + const secondaryId = parts[4] !== '0' && parts[4] !== '-' ? parseInt(parts[4]) : null; + let secondaryName = null; + let secondaryMapped = null; + if (secondaryId !== null && parts[5] !== '-') { + secondaryName = quotedStrings[quotedIndex++]; + secondaryMapped = this.categoryConverter.execute(String(secondaryId)); + } else { + secondaryName = parts[5] === '-' ? null : parts[5]; + } + + // Extract security category (parts[6], parts[7]) + const securityId = parts[6] !== '0' && parts[6] !== '-' ? parseInt(parts[6]) : null; + let securityName = null; + let securityMapped = null; + if (securityId !== null && parts[7] !== '-') { + securityName = quotedStrings[quotedIndex++]; + securityMapped = this.categoryConverter.execute(String(securityId)); + } else { + securityName = parts[7] === '-' ? null : parts[7]; + } + + // Extract reputation (parts[8], parts[9]) + const reputationId = parseInt(parts[8]); + const reputationName = quotedStrings[quotedIndex++]; + + // Extract matching flag (parts[10], parts[11]) + const matchingFlagValue = parts[10]; + const matchingFlagNames = parts[11] ? parts[11].split('|') : []; + + // Extract age rating (parts[12], parts[13]) + const ageRatingId = parseInt(parts[12]); + const ageRatingName = quotedStrings[quotedIndex++]; + + // Extract category groups starting from parts[16] (after 2 empty fields) + // Each group has: id (number), name (quoted string) + // Pattern: 0 "Internet/Infrastructure" 0 "Malware/Security" ... 
+ const categoryGroups = { + internet_infrastructure: { + id: parseInt(parts[16]) || 0, + name: quotedStrings[quotedIndex++] + }, + malware_security: { + id: parseInt(parts[18]) || 0, + name: quotedStrings[quotedIndex++] + }, + dangerous_harmful: { + id: parseInt(parts[20]) || 0, + name: quotedStrings[quotedIndex++] + }, + adult: { + id: parseInt(parts[22]) || 0, + name: quotedStrings[quotedIndex++] + }, + business_government: { + id: parseInt(parts[24]) || 0, + name: quotedStrings[quotedIndex++] + }, + personal: { + id: parseInt(parts[26]) || 0, + name: quotedStrings[quotedIndex++] + }, + computing_technology: { + id: parseInt(parts[28]) || 0, + name: quotedStrings[quotedIndex++] + }, + social_media: { + id: parseInt(parts[30]) || 0, + name: quotedStrings[quotedIndex++] + }, + miscellaneous: { + id: parseInt(parts[32]) || 0, + name: quotedStrings[quotedIndex++] + } + }; + + // Extract volume index and submitted URL + const volumeIndex = parts[35]; + const submittedUrl = parts[36]; + + const result = { + result_status: parts[0], + matching_index: parseInt(parts[1]), + primary_category: { + id: primaryId, + name: primaryName, + mapped_id: String(primaryMapped) + }, + secondary_category: { + id: secondaryId, + name: secondaryName, + mapped_id: secondaryMapped ? String(secondaryMapped) : null + }, + security_category: { + id: securityId, + name: securityName, + mapped_id: securityMapped ? 
String(securityMapped) : null + }, + reputation: { + id: reputationId, + name: reputationName + }, + matching_flag: { + value: matchingFlagValue, + names: matchingFlagNames + }, + age_rating: { + id: ageRatingId, + name: ageRatingName + }, + category_groups: categoryGroups, + volume_index: volumeIndex, + submitted_url: submittedUrl + }; + + // Add result array in the same format as / endpoint + const resultArray = [result.primary_category.mapped_id]; + if (result.secondary_category.mapped_id !== null) { + resultArray.push(result.secondary_category.mapped_id); + } + result.result = resultArray; + + return result; + } +} + +module.exports = { ParseFullCategoryUseCase } diff --git a/src/use-cases/update-categories-use-case.js b/src/use-cases/update-categories-use-case.js index df40dfc..7855d14 100644 --- a/src/use-cases/update-categories-use-case.js +++ b/src/use-cases/update-categories-use-case.js @@ -1,20 +1,26 @@ -const { exec } = require("node:child_process") +const { exec } = require("node:child_process"); class UpdateCategoriesUseCase { execute() { - return new Promise((_, reject) => { - exec(`bin/gcf1dbmng.sh etc urldb_update`, { cwd: '/usr/local/gcf1' }, (error, stdout, stderr) => { - if (error) { - console.error(error); - reject(error); + exec(`bin/gcf1dbmng.sh etc urldb_download`, { cwd: '/usr/local/gcf1' }, (error, stdout, stderr) => { + if (error) { + console.error('Erro no primeiro comando:', error); + return; + } + + console.log('Saída do primeiro comando:', stdout); + + // Executa o segundo comando após o primeiro ter sido concluído + exec(`bin/gcf1dbmng.sh etc urldb_update`, { cwd: '/usr/local/gcf1' }, (error2, stdout2, stderr2) => { + if (error2) { + console.error('Erro no segundo comando:', error2); return; } - console.log(stdout) + console.log('Saída do segundo comando:', stdout2); }); }); } } -module.exports = { UpdateCategoriesUseCase } - +module.exports = { UpdateCategoriesUseCase }; \ No newline at end of file diff --git a/test-detailed.http 
b/test-detailed.http index a5553fa..5f10afe 100644 --- a/test-detailed.http +++ b/test-detailed.http @@ -1,14 +1,27 @@ @baseUrl = http://localhost:3333 -### Test detailed endpoint with Facebook -POST {{baseUrl}}/detailed +# ============================================ +# POST / - Basic Categorization Endpoint +# Returns: Simple result array of category IDs +# ============================================ + +### Basic endpoint - Facebook +POST {{baseUrl}}/ Content-Type: application/json { "fqdn": "facebook.com" } -### Test detailed endpoint with Google +### Basic endpoint - Google +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "google.com" +} + +### Basic endpoint - TikTok POST {{baseUrl}}/ Content-Type: application/json @@ -16,15 +29,60 @@ Content-Type: application/json "fqdn": "tiktok.com" } -### Test detailed endpoint with Reddit -POST {{baseUrl}}/detailed +### Basic endpoint - YouTube +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "youtube.com" +} + +### Basic endpoint - Reddit +POST {{baseUrl}}/ Content-Type: application/json { "fqdn": "reddit.com" } -### Test detailed endpoint with YouTube +### Basic endpoint - Twitter +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "twitter.com" +} + +# ============================================ +# POST /detailed - Detailed Categorization +# Returns: Categories with names, reputation, age rating +# ============================================ + +### Detailed endpoint - Facebook +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "facebook.com" +} + +### Detailed endpoint - Google +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "google.com" +} + +### Detailed endpoint - TikTok +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "tiktok.com" +} + +### Detailed endpoint - YouTube POST {{baseUrl}}/detailed Content-Type: application/json @@ -32,7 +90,15 @@ Content-Type: application/json "fqdn": 
"youtube.com" } -### Test detailed endpoint with Twitter +### Detailed endpoint - Reddit +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "reddit.com" +} + +### Detailed endpoint - Twitter POST {{baseUrl}}/detailed Content-Type: application/json @@ -40,10 +106,83 @@ Content-Type: application/json "fqdn": "twitter.com" } -### Test simple endpoint with Facebook (original endpoint) -POST {{baseUrl}}/ +# ============================================ +# POST /full - Complete Raw Output +# Returns: All 35 fields from NetStar with category groups +# ============================================ + +### Full endpoint - Facebook +POST {{baseUrl}}/full Content-Type: application/json { "fqdn": "facebook.com" } + +### Full endpoint - Google +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "google.com" +} + +### Full endpoint - TikTok +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "tiktok.com" +} + +### Full endpoint - YouTube +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "youtube.com" +} + +### Full endpoint - Reddit +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "reddit.com" +} + +### Full endpoint - Twitter +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "twitter.com" +} + +# ============================================ +# Edge cases and special domains +# ============================================ + +### Basic endpoint - Unknown domain +POST {{baseUrl}}/ +Content-Type: application/json + +{ + "fqdn": "unknowndomainexample12345.com" +} + +### Detailed endpoint - Localhost +POST {{baseUrl}}/detailed +Content-Type: application/json + +{ + "fqdn": "localhost" +} + +### Full endpoint - IP-like domain +POST {{baseUrl}}/full +Content-Type: application/json + +{ + "fqdn": "99999.incompass.netstar-inc.com" +}