This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
import pytest
|
||||
from src.scraper.exhaustive import detect_source_type, is_blacklisted, normalize_url
|
||||
from src.processor.processor import simple_chunk
|
||||
|
||||
|
||||
def test_detect_source_type():
    """Check that representative URLs are classified with the expected source label."""
    url_and_label = (
        ("https://youtube.com/watch?v=dQw4w9WgXcY", "youtube"),
        ("https://reddit.com/r/test/comments/abc", "reddit"),
        ("https://en.wikipedia.org/wiki/Roswell", "wikipedia"),
        ("https://example.com/doc.pdf", "pdf"),
        ("https://example.com/article", "web"),
    )
    # Cases run in the listed order; the first mismatch fails the test.
    for url, label in url_and_label:
        assert detect_source_type(url) == label
|
||||
|
||||
|
||||
def test_is_blacklisted():
    """Check the blacklist predicate on one blocked and one allowed URL.

    A facebook.com URL is expected to be reported as blacklisted, while an
    en.wikipedia.org URL is expected to pass.
    """
    # Assert on truthiness directly rather than comparing with `== True` /
    # `== False`, which PEP 8 discourages (pycodestyle E712).
    assert is_blacklisted("https://facebook.com/something")
    assert not is_blacklisted("https://en.wikipedia.org/wiki/Test")
|
||||
|
||||
|
||||
def test_normalize_url():
    """Check that a URL fragment and a trailing slash are both normalized away."""
    canonical = "https://example.com/page"
    raw_variants = (
        "https://example.com/page#section",  # carries a fragment
        "https://example.com/page/",  # carries a trailing slash
    )
    for raw in raw_variants:
        assert normalize_url(raw) == canonical
|
||||
|
||||
|
||||
def test_simple_chunk():
    """Check that chunking a 50-paragraph text yields more than one string chunk."""
    paragraphs = [f"Paragraph {i} with some content here." for i in range(50)]
    document = "\n\n".join(paragraphs)

    pieces = simple_chunk(document, chunk_size=100, overlap=20)

    # The input is far longer than chunk_size, so a single chunk is a failure.
    assert len(pieces) > 1
    for piece in pieces:
        assert isinstance(piece, str)
|
||||
Reference in New Issue
Block a user