Document Indexing and Retrieval

● Implement an inverted index construction algorithm.

import nltk
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')

class InvertedIndex:
    def __init__(self):
        self.index = defaultdict(set)  # Dictionary to store the index

    def preprocess(self, text):
        """Tokenizes text, removes punctuation and stopwords"""
        tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return tokens

    def add_document(self, doc_id, text):
        """Adds a document to the inverted index"""
        words = self.preprocess(text)
        for word in words:
            self.index[word].add(doc_id)

    def search(self, query):
        """Searches for documents containing the query word"""
        query = query.lower()
        return self.index.get(query, set())

    def display_index(self):
        """Displays the inverted index"""
        for word, doc_ids in self.index.items():
            print(f"{word}: {sorted(doc_ids)}")

# Example usage
documents = {
    1: "Machine learning is a method of data analysis.",
    2: "Data mining involves machine learning techniques.",
    3: "Retrieval of information is crucial in search engines.",
    4: "Search engines use inverted indexing for fast retrieval."
}

# Create an inverted index
index = InvertedIndex()

# Add documents
for doc_id, text in documents.items():
    index.add_document(doc_id, text)

# Display the index
print("\nInverted Index:")
index.display_index()

# Search for a word
query_word = "machine"
print(f"\nDocuments containing '{query_word}':", index.search(query_word))

● Build a simple document retrieval system using the constructed index.

import nltk
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary resources (if not downloaded)
nltk.download('punkt')
nltk.download('stopwords')

class InvertedIndex:
    def __init__(self):
        self.index = defaultdict(set)  # Dictionary to store the inverted index

    def preprocess(self, text):
        """Tokenizes text, removes punctuation and stopwords"""
        tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
        stop_words = set(stopwords.words('english'))  # Get stopwords
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return tokens

    def add_document(self, doc_id, text):
        """Adds a document to the inverted index"""
        words = self.preprocess(text)
        for word in words:
            self.index[word].add(doc_id)

    def search(self, query):
        """Search for a word in the index and return relevant document IDs"""
        query = query.lower()
        return self.index.get(query, set())

    def display_index(self):
        """Displays the inverted index"""
        for word, doc_ids in self.index.items():
            print(f"{word}: {sorted(doc_ids)}")

# Sample Documents
documents = {
    1: "Machine learning is a subset of artificial intelligence.",
    2: "Data mining involves machine learning techniques.",
    3: "Information retrieval helps find relevant documents.",
    4: "Search engines use an inverted index for fast retrieval."
}

# Create an inverted index
index = InvertedIndex()

# Add documents to the index
for doc_id, text in documents.items():
    index.add_document(doc_id, text)

# Display the inverted index
print("\nInverted Index:")
index.display_index()

# User Search
while True:
    query_word = input("\nEnter a word to search (or type 'exit' to quit): ").strip()
    if query_word.lower() == "exit":
        break
    result = index.search(query_word)
    if result:
        print(f"Documents containing '{query_word}': {result}")
    else:
        print(f"No documents found for '{query_word}'")

PRACTICAL NO 2

● Implement the Boolean retrieval model and process queries.

import nltk
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

class BooleanRetrieval:
    def __init__(self):
        self.index = defaultdict(set)  # Dictionary to store the inverted index
        self.all_docs = set()          # Track all document IDs

    def preprocess(self, text):
        """Tokenizes text, removes punctuation and stopwords"""
        tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
        stop_words = set(stopwords.words('english'))  # Get stopwords
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return tokens

    def add_document(self, doc_id, text):
        """Adds a document to the inverted index"""
        self.all_docs.add(doc_id)
        words = self.preprocess(text)
        for word in words:
            self.index[word].add(doc_id)

    def boolean_query(self, query):
        """Processes Boolean queries with AND, OR, NOT"""
        terms = query.lower().split()
        result = set()
        i = 0
        while i < len(terms):
            term = terms[i]
            if term == "and":
                i += 1
                if i < len(terms):
                    result &= self.index.get(terms[i], set())
            elif term == "or":
                i += 1
                if i < len(terms):
                    result |= self.index.get(terms[i], set())
            elif term == "not":
                i += 1
                if i < len(terms):
                    result = self.all_docs - self.index.get(terms[i], set())
            else:
                result = self.index.get(term, set())
            i += 1
        return result

    def display_index(self):
        """Displays the inverted index"""
        for word, doc_ids in self.index.items():
            print(f"{word}: {sorted(doc_ids)}")

# Sample Documents
documents = {
    1: "Machine learning is a subset of artificial intelligence.",
    2: "Data mining involves machine learning techniques.",
    3: "Information retrieval helps find relevant documents.",
    4: "Search engines use an inverted index for fast retrieval."
}

# Create Boolean Retrieval System
boolean_retrieval = BooleanRetrieval()

# Add documents to the index
for doc_id, text in documents.items():
    boolean_retrieval.add_document(doc_id, text)

# Display the inverted index
print("\nInverted Index:")
boolean_retrieval.display_index()

# User Query Processing
while True:
    query = input("\nEnter a Boolean query (use AND, OR, NOT) or 'exit' to quit: ").strip()
    if query.lower() == "exit":
        break
    result = boolean_retrieval.boolean_query(query)
    if result:
        print(f"Documents matching '{query}': {sorted(result)}")
    else:
        print(f"No documents found for '{query}'")

● Implement the vector space model with TF-IDF weighting and cosine similarity.

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer(tokenizer=self.preprocess)
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)
        self.terms = self.vectorizer.get_feature_names_out()

    def preprocess(self, text):
        """Tokenizes text, removes punctuation and stopwords"""
        tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
        stop_words = set(stopwords.words('english'))  # Load stopwords
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return tokens

    def search(self, query):
        """Computes cosine similarity between query and document vectors"""
        query_tfidf = self.vectorizer.transform([query])  # Transform query to TF-IDF
        similarities = cosine_similarity(query_tfidf, self.tfidf_matrix)[0]
        # Rank documents by similarity score
        ranked_docs = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)
        return ranked_docs

# Sample Documents
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Data mining involves machine learning techniques.",
    "Information retrieval helps find relevant documents.",
    "Search engines use an inverted index for fast retrieval."
]

# Create Vector Space Model
vsm = VectorSpaceModel(documents)

# User Query Processing
while True:
    query = input("\nEnter a search query (or 'exit' to quit): ").strip()
    if query.lower() == "exit":
        break
    ranked_results = vsm.search(query)
    print("\nRanked Documents by Relevance:")
    for doc_id, score in ranked_results:
        print(f"Document {doc_id + 1} (Score: {score:.4f}): {documents[doc_id]}")
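For reference, the cosine_similarity call above reduces to dot(a, b) / (||a|| * ||b||) between TF-IDF vectors. The following is a minimal NumPy sketch of that computation; the vectors and values are made up for illustration and are not part of the original practical.

import numpy as np

def cosine(a, b):
    """Cosine similarity: dot(a, b) / (||a|| * ||b||)."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

# Toy TF-IDF vectors for a query and one document (illustrative values only)
query_vec = np.array([0.2, 0.0, 0.5])
doc_vec = np.array([0.1, 0.3, 0.4])
print(f"Cosine similarity: {cosine(query_vec, doc_vec):.4f}")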
PRACTICAL NO 3

● Develop a spelling correction module using edit distance algorithms.

import nltk
from nltk.corpus import words
from nltk.metrics.distance import edit_distance

# Download NLTK word dictionary (if not already downloaded)
nltk.download('words')

class SpellingCorrector:
    def __init__(self):
        self.word_list = set(words.words())  # Load English dictionary words

    def correct_spelling(self, input_word):
        """Find the closest word from the dictionary using edit distance"""
        input_word = input_word.lower()
        closest_match = min(self.word_list, key=lambda word: edit_distance(input_word, word))
        return closest_match

# Create Spelling Corrector
corrector = SpellingCorrector()

# User Input
while True:
    word = input("\nEnter a word to check spelling (or 'exit' to quit): ").strip()
    if word.lower() == "exit":
        break
    corrected_word = corrector.correct_spelling(word)
    if word.lower() == corrected_word:
        print(f"'{word}' is correctly spelled.")
    else:
        print(f"Did you mean '{corrected_word}'?")

● Integrate the spelling correction module into an information retrieval system.

from spellchecker import SpellChecker
import re

class SpellingCorrectionIR:
    def __init__(self):
        # Initialize the spell checker
        self.spell = SpellChecker()

    def correct_spelling(self, query):
        """Correct the spelling of each word in the user's query."""
        # Tokenize the query (split by spaces)
        words = query.split()
        # Correct each word; fall back to the original word if no correction is found
        corrected_words = [self.spell.correction(word) or word for word in words]
        # Reconstruct the query with corrected words
        corrected_query = " ".join(corrected_words)
        return corrected_query

    def preprocess_query(self, query):
        """Preprocess the query, correct spelling errors, and prepare for IR."""
        # Apply basic cleaning (removes special characters)
        cleaned_query = re.sub(r'[^A-Za-z0-9 ]+', '', query)
        # Correct the spelling of the query
        corrected_query = self.correct_spelling(cleaned_query)
        return corrected_query

    def search_documents(self, query):
        """
        Simulate a simple document search.
        In an actual IR system, this would query a database or index.
        """
        # Example documents (in practice, use a real index or document store)
        documents = {
            1: "Information retrieval systems are important.",
            2: "Spell checking is a necessary feature in search engines.",
            3: "Learn about natural language processing and machine learning.",
            4: "Search engines often use spelling correction to improve accuracy."
        }
        # Preprocess the query before searching
        processed_query = self.preprocess_query(query)
        # Simple search by checking if query words appear in documents
        relevant_docs = []
        for doc_id, doc_text in documents.items():
            if all(word.lower() in doc_text.lower() for word in processed_query.split()):
                relevant_docs.append(doc_id)
        return relevant_docs, processed_query

# Example usage
if __name__ == "__main__":
    ir_system = SpellingCorrectionIR()
    # Simulate a user's search query
    user_query = "infomration retrival systmes are important"
    print(f"Original Query: {user_query}")
    # Search for relevant documents
    relevant_docs, corrected_query = ir_system.search_documents(user_query)
    print(f"Corrected Query: {corrected_query}")
    print(f"Relevant Document IDs: {relevant_docs}")
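For reference, the edit distance used in the first part of this practical can be computed with a small dynamic program. The sketch below is illustrative only and assumes unit costs for insertion, deletion, and substitution (the defaults of nltk's edit_distance); it is not part of the original practical.

def levenshtein(s, t):
    """Minimal DP sketch of the Levenshtein (edit) distance, assuming unit costs."""
    m, n = len(s), len(t)
    # dp[i][j] = edit distance between s[:i] and t[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i  # delete all of s[:i]
    for j in range(n + 1):
        dp[0][j] = j  # insert all of t[:j]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s[i - 1] == t[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution (or match)
    return dp[m][n]

print(levenshtein("infomration", "information"))  # 2 (the swapped letters count as two substitutions)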
PRACTICAL NO 4

A) Calculate precision, recall, and F-measure for a given set of retrieval results.

def calculate_metrics(relevant_retrieved, total_retrieved, total_relevant):
    # Precision = relevant_retrieved / total_retrieved
    precision = relevant_retrieved / total_retrieved if total_retrieved != 0 else 0
    # Recall = relevant_retrieved / total_relevant
    recall = relevant_retrieved / total_relevant if total_relevant != 0 else 0
    # F-measure = 2 * (precision * recall) / (precision + recall)
    if (precision + recall) != 0:
        f_measure = 2 * (precision * recall) / (precision + recall)
    else:
        f_measure = 0
    return precision, recall, f_measure

# Example values
relevant_retrieved = 4   # Number of relevant documents retrieved
total_retrieved = 6      # Total number of documents retrieved
total_relevant = 10      # Total number of relevant documents in the collection

# Calculate the metrics
precision, recall, f_measure = calculate_metrics(relevant_retrieved, total_retrieved, total_relevant)

# Output the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-measure: {f_measure:.4f}")

B) Use an evaluation toolkit to measure average precision and other evaluation metrics.

from sklearn.metrics import average_precision_score, precision_score, recall_score

def evaluate_ir_system(retrieved_scores, relevant_labels):
    """
    Evaluate the IR system using precision, recall, and average precision.
    retrieved_scores: list of predicted relevance scores from the IR system
    relevant_labels: list of actual relevance labels (0 for irrelevant, 1 for relevant)
    """
    # Calculate the Average Precision (AP) score from labels and retrieval scores
    ap_score = average_precision_score(relevant_labels, retrieved_scores)
    # Calculate Precision and Recall at a fixed score cut-off of 0.5
    precision = precision_score(relevant_labels, [1 if score >= 0.5 else 0 for score in retrieved_scores])
    recall = recall_score(relevant_labels, [1 if score >= 0.5 else 0 for score in retrieved_scores])
    return ap_score, precision, recall

# Example data
retrieved_scores = [0.9, 0.8, 0.7, 0.4, 0.3, 0.1]  # Predicted relevance scores (from IR system)
relevant_labels = [1, 1, 0, 1, 0, 0]               # Actual relevance labels (1 = relevant, 0 = irrelevant)

# Evaluate the system
ap, precision, recall = evaluate_ir_system(retrieved_scores, relevant_labels)

# Output the results
print(f"Average Precision (AP): {ap:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

PRACTICAL NO 5

● Implement a text classification algorithm (e.g., Naive Bayes or Support Vector Machine).

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Sample text data and their labels
data = {
    "text": [
        "I love this phone",
        "This movie is amazing",
        "The product is terrible",
        "I hate the food",
        "Such a great experience",
        "Worst movie ever",
        "I am very satisfied with this purchase",
        "I will never buy this again"
    ],
    "label": ["positive", "positive", "negative", "negative",
              "positive", "negative", "positive", "negative"]
}

# Create a pandas DataFrame
df = pd.DataFrame(data)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)

# SVM Classifier
svm_classifier = SVC(kernel='linear')  # Linear kernel for text classification
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate Naive Bayes
print("Naive Bayes Classifier Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))

# Evaluate SVM
print("SVM Classifier Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

PRACTICAL NO 6

A) Implement a clustering algorithm (e.g., k-means or hierarchical clustering).

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Step 1: Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')  # Load the complete dataset
X = newsgroups.data  # Text data (documents)

# Step 2: Vectorize the text using TF-IDF (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer(stop_words='english')  # Remove common English stopwords
X_tfidf = vectorizer.fit_transform(X)  # Transform the text data into TF-IDF features

# Step 3: Apply k-means clustering
k = 5  # Choose number of clusters (k)
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_tfidf)  # Fit the model

# Step 4: Assign each document to a cluster
y_kmeans = kmeans.predict(X_tfidf)  # Predicted cluster labels for each document

# Step 5: Inspect the results
# Print the top terms per cluster to understand the key topics in each cluster
print("\nTop terms per cluster:")
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(k):
    print(f"\nCluster {i+1}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]  # Top 10 terms for the cluster
    print("Top terms:", top_terms)

# Step 6: Visualize the clusters (optional - reduce the dimensionality to 2D with
# TruncatedSVD, which works directly on the sparse TF-IDF matrix)
svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(X_tfidf)

plt.figure(figsize=(8, 6))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_kmeans, cmap='viridis', marker='o')
plt.title("K-means Clustering of 20 Newsgroups")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(label='Cluster ID')
plt.show()

# Step 7: Examine some sample documents from each cluster
print("\nSample documents from each cluster:")
for i in range(k):
    print(f"\nCluster {i+1} Sample Document(s):")
    sample_idx = np.where(y_kmeans == i)[0]
    for j in range(2):  # Show 2 sample documents per cluster
        print(f"Document {j+1}: {X[sample_idx[j]]}\n")
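To make the pipeline easier to inspect before running it on the full 20 Newsgroups corpus, here is a minimal sketch of the same TF-IDF + k-means steps on a tiny made-up corpus; the documents and the choice of k=2 are assumptions for illustration only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Tiny toy corpus with two obvious topics (illustrative data)
docs = [
    "machine learning and data mining",
    "deep learning for data analysis",
    "football match results and scores",
    "basketball league scores today",
]
X_toy = TfidfVectorizer(stop_words='english').fit_transform(docs)
labels = KMeans(n_clusters=2, random_state=42, n_init=10).fit_predict(X_toy)
for doc, label in zip(docs, labels):
    print(f"Cluster {label}: {doc}")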
B) Apply the clustering algorithm to a set of documents and evaluate the clustering results.

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, v_measure_score
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Step 1: Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')  # Load the complete dataset (train + test)
X = newsgroups.data    # Text data (documents)
y = newsgroups.target  # True labels for the newsgroups (not used directly for clustering)

# Step 2: Vectorize the text using TF-IDF (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer(stop_words='english')  # Remove common English stop words
X_tfidf = vectorizer.fit_transform(X)  # Transform the text data into TF-IDF features

# Step 3: Apply k-means clustering
k = 5  # Choose the number of clusters (k)
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_tfidf)  # Fit the model

# Step 4: Predict the cluster for each document
y_kmeans = kmeans.predict(X_tfidf)  # Predicted cluster labels for each document

# Step 5: Evaluate the clustering results
# Silhouette Score
sil_score = silhouette_score(X_tfidf, y_kmeans)
print(f"Silhouette Score: {sil_score:.4f}")

# Homogeneity, Completeness, and V-Measure
homogeneity = homogeneity_score(y, y_kmeans)
completeness = completeness_score(y, y_kmeans)
v_measure = v_measure_score(y, y_kmeans)
print(f"Homogeneity Score: {homogeneity:.4f}")
print(f"Completeness Score: {completeness:.4f}")
print(f"V-Measure Score: {v_measure:.4f}")

# Step 6: Inspect the results (top terms per cluster)
print("\nTop terms per cluster:")
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(k):
    print(f"\nCluster {i+1}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]  # Top 10 terms for the cluster
    print("Top terms:", top_terms)

# Step 7: Visualize the clusters (using Truncated SVD for dimensionality reduction)
svd = TruncatedSVD(n_components=2, random_state=42)  # Reduce to 2D for visualization
X_svd = svd.fit_transform(X_tfidf)  # Apply Truncated SVD

# Plotting the clusters
plt.figure(figsize=(8, 6))
plt.scatter(X_svd[:, 0], X_svd[:, 1], c=y_kmeans, cmap='viridis', marker='o')
plt.title("K-means Clustering of 20 Newsgroups (Truncated SVD)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(label='Cluster ID')
plt.show()

# Step 8: Examine some sample documents from each cluster
print("\nSample documents from each cluster:")
for i in range(k):
    print(f"\nCluster {i+1} Sample Document(s):")
    sample_idx = np.where(y_kmeans == i)[0]
    for j in range(2):  # Show 2 sample documents per cluster
        print(f"Document {j+1}: {X[sample_idx[j]]}\n")

PRACTICAL NO 7

A) Develop a web crawler to fetch and index web pages.
import requests
from bs4 import BeautifulSoup
import nltk
from urllib.parse import urlparse, urljoin
import sqlite3
import time
import re

# Initialize NLTK resources (e.g., stopwords)
nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 1: Set up the SQLite database for storing the index
def create_db():
    conn = sqlite3.connect('web_index.db')
    c = conn.cursor()
    # Create a table for storing URLs and their contents
    c.execute('''CREATE TABLE IF NOT EXISTS pages
                 (url TEXT PRIMARY KEY, content TEXT)''')
    # Create a table for storing the inverted index (keyword -> URL mapping)
    c.execute('''CREATE TABLE IF NOT EXISTS index_table
                 (keyword TEXT, url TEXT)''')
    conn.commit()
    return conn, c

# Step 2: Crawl and fetch content from a web page
def fetch_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Step 3: Parse the HTML and extract meaningful content
def parse_page(url, html):
    soup = BeautifulSoup(html, 'html.parser')
    # Get the page content, removing unnecessary tags
    paragraphs = soup.find_all('p')
    content = ' '.join([p.get_text() for p in paragraphs])
    # Clean the content (remove extra spaces, special characters)
    content = re.sub(r'\s+', ' ', content)
    return content

# Step 4: Tokenize and index the content of the page
def index_page(url, content, conn, c):
    # Tokenize the content and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\w+', content.lower())  # Extract words and convert to lowercase
    words = [word for word in words if word not in stop_words]
    # Insert the page content into the 'pages' table
    c.execute("INSERT OR REPLACE INTO pages (url, content) VALUES (?, ?)", (url, content))
    # Insert the inverted index (keyword -> URL mapping)
    for word in set(words):  # Avoid inserting the same word more than once
        c.execute("INSERT OR REPLACE INTO index_table (keyword, url) VALUES (?, ?)", (word, url))
    conn.commit()

# Step 5: Get the links from a page (to crawl further)
def get_links(url, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    absolute_links = set()
    for link in links:
        link_url = urljoin(url, link['href'])
        if urlparse(link_url).netloc == urlparse(url).netloc:  # Only internal links
            absolute_links.add(link_url)
    return absolute_links

# Step 6: Crawl the web and index pages
def crawl_and_index(start_url, max_depth=2):
    conn, c = create_db()
    visited = set()              # Set to track visited URLs
    to_visit = [(start_url, 0)]  # Stack of URLs with their depth level
    while to_visit:
        url, depth = to_visit.pop()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        print(f"Crawling: {url} (Depth: {depth})")
        html = fetch_page(url)
        if html:
            content = parse_page(url, html)
            index_page(url, content, conn, c)
            # Get the links on the page and add them to the crawl queue
            links = get_links(url, html)
            for link in links:
                if link not in visited:
                    to_visit.append((link, depth + 1))
        time.sleep(1)  # To avoid making too many requests in a short time
    conn.close()

# Step 7: Query the index
def search(keyword, conn):
    c = conn.cursor()
    c.execute("SELECT url FROM index_table WHERE keyword=?", (keyword,))
    rows = c.fetchall()
    if rows:
        print(f"Results for '{keyword}':")
        for row in rows:
            print(row[0])
    else:
        print(f"No results found for '{keyword}'.")

# Example Usage:
if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Web_scraping"  # Starting point for the crawl
    crawl_and_index(start_url, max_depth=2)
    # Search the index for a keyword
    conn = sqlite3.connect('web_index.db')
    search("scraping", conn)
    conn.close()

B) Handle challenges such as robots.txt, dynamic content, and crawling delays.

import requests
from bs4 import BeautifulSoup
import urllib.robotparser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random
from urllib.parse import urlparse

# Respect robots.txt
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)

# Fetch static content
def fetch_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Fetch dynamic content using Selenium
def fetch_dynamic_content(url):
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(executable_path='/path/to/chromedriver', options=options)
    driver.get(url)
    time.sleep(3)
    html = driver.page_source
    driver.quit()
    return html

# Parse and extract content from HTML
def parse_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p')
    content = ' '.join([p.get_text() for p in paragraphs])
    return content

# Crawl with random delays between requests
def crawl_with_delay(urls):
    for url in urls:
        if can_crawl(url):  # Check robots.txt
            print(f"Crawling {url}")
            html = fetch_page(url) if 'javascript' not in url else fetch_dynamic_content(url)
            if html:
                content = parse_page(html)
                print(f"Content from {url}: {content[:100]}...")  # Show the first 100 characters
            else:
                print(f"Failed to retrieve {url}")
        else:
            print(f"Skipping {url} due to robots.txt")
        # Simulate delay between requests
        delay = random.uniform(1, 3)
        print(f"Sleeping for {delay:.2f} seconds...\n")
        time.sleep(delay)

# Example usage
urls_to_crawl = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://en.wikipedia.org/wiki/Web_scraping"
]
crawl_with_delay(urls_to_crawl)

PRACTICAL NO 8

A) Implement the PageRank algorithm to rank web pages based on link analysis.

import numpy as np

# Function to compute PageRank
def page_rank(links, d=0.85, max_iter=100, tol=1.0e-6):
    """
    links: adjacency matrix where links[i][j] is 1 if page i links to page j, otherwise 0
    d: damping factor, typically 0.85
    max_iter: maximum number of iterations
    tol: convergence tolerance (stop when the difference between iterations is smaller than this)
    Returns: PageRank values for each page
    """
    N = len(links)
    # Create the transition matrix based on the link structure
    M = np.zeros((N, N))
    for i in range(N):
        out_links = np.sum(links[i])
        if out_links > 0:
            M[i] = links[i] / out_links
    # Initialize the PageRank values (equal for all pages initially)
    pr = np.ones(N) / N
    # Iteratively compute PageRank
    for _ in range(max_iter):
        pr_new = (1 - d) / N + d * M.T.dot(pr)  # Apply the PageRank formula
        # Check convergence (the PageRank values change very little)
        if np.linalg.norm(pr_new - pr, 1) < tol:
            break
        pr = pr_new
    return pr

# Example usage:
if __name__ == "__main__":
    # Example: 4 pages with links
    # links[i][j] is 1 if page i links to page j
    # 0 -> 1
    # 1 -> 2
    # 2 -> 0, 1
    # 3 -> 0
    links = np.array([
        [0, 1, 0, 0],  # Page 0 links to Page 1
        [0, 0, 1, 0],  # Page 1 links to Page 2
        [1, 1, 0, 0],  # Page 2 links to Page 0 and Page 1
        [1, 0, 0, 0]   # Page 3 links to Page 0
    ])

    # Calculate PageRank
    pr_values = page_rank(links)
    print("PageRank values:")
    for i, pr in enumerate(pr_values):
        print(f"Page {i}: {pr:.4f}")

B) Apply the PageRank algorithm to a small web graph and analyze the results.

import numpy as np

# Function to compute PageRank
def page_rank(links, d=0.85, max_iter=100, tol=1.0e-6):
    N = len(links)  # Number of pages
    M = np.zeros((N, N))
    # Create the transition matrix M based on the links
    for i in range(N):
        out_links = np.sum(links[i])
        if out_links > 0:
            M[i] = links[i] / out_links
    # Initialize PageRank with equal values
    pr = np.ones(N) / N
    # Iteratively calculate PageRank
    for _ in range(max_iter):
        pr_new = (1 - d) / N + d * M.T.dot(pr)
        # Check convergence
        if np.linalg.norm(pr_new - pr, 1) < tol:
            break
        pr = pr_new
    return pr

# Example web graph with 5 pages and their links
links = np.array([
    [0, 1, 1, 0, 0],  # P0 links to P1 and P2
    [0, 0, 1, 1, 0],  # P1 links to P2 and P3
    [1, 0, 0, 0, 1],  # P2 links to P0 and P4
    [0, 0, 0, 0, 1],  # P3 links to P4
    [1, 0, 0, 0, 0]   # P4 links to P0
])

# Apply the PageRank algorithm
pr_values = page_rank(links)

# Print results
print("PageRank values:")
for i, pr in enumerate(pr_values):
    print(f"Page P{i}: {pr:.4f}")

PRACTICAL NO 9

A) Implement a learning-to-rank algorithm (e.g., RankSVM or RankBoost).

import numpy as np
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, ndcg_score

# ======= RankSVM (Support Vector Machine for Ranking) ======= #

# Generate a synthetic dataset for ranking
# X: feature matrix (100 samples, 5 features)
# y: relevance scores (0-3) for each document
X, y = np.random.rand(100, 5), np.random.randint(0, 4, size=100)

# RankSVM: SVC model for ranking (pairwise approach)
svm = SVC(kernel='linear', C=1.0, probability=True)

# Split data into training and testing (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
svm.fit(X_train, y_train)

# Predict rankings for test data
y_pred = svm.predict(X_test)

# Calculate Mean Squared Error as a performance metric
mse = mean_squared_error(y_test, y_pred)
print("RankSVM - Mean Squared Error:", mse)

# ======= RankBoost (using XGBoost for Ranking) ======= #

# XGBoost requires data in DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters for RankBoost (XGBoost with a ranking objective)
params = {
    'objective': 'rank:pairwise',  # Ranking objective
    'eval_metric': 'ndcg',         # NDCG as evaluation metric
    'learning_rate': 0.1,          # Learning rate
    'max_depth': 6                 # Maximum depth of trees
}

# Train the RankBoost model using XGBoost (100 boosting rounds)
rank_boost_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict rankings for test data using the trained model
y_pred_rankboost = rank_boost_model.predict(dtest)

# Print predicted rankings
print("RankBoost (XGBoost) Predicted Rankings:", y_pred_rankboost)

# ======= Evaluation (NDCG Score for Ranking Performance) ======= #

# Calculate the Normalized Discounted Cumulative Gain (NDCG) for evaluation.
# NDCG is often used for ranking tasks to measure the quality of the ranking.
ndcg = ndcg_score([y_test], [y_pred_rankboost])
print(f"RankBoost (XGBoost) - NDCG Score: {ndcg:.4f}")
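For reference, here is a minimal sketch of how the NDCG metric used above is computed; it is illustrative only (sklearn's ndcg_score additionally handles ties and top-k truncation), and the relevance values and scores below are made up.

import numpy as np

def dcg(relevances):
    """Discounted cumulative gain of a ranked list of relevance values."""
    relevances = np.asarray(relevances, dtype=float)
    discounts = np.log2(np.arange(2, relevances.size + 2))
    return float(np.sum(relevances / discounts))

def ndcg(true_relevance, predicted_scores):
    """DCG of the predicted ordering divided by the ideal (best possible) DCG."""
    true_relevance = np.asarray(true_relevance, dtype=float)
    order = np.argsort(predicted_scores)[::-1]  # rank documents by predicted score
    ideal = np.sort(true_relevance)[::-1]       # best possible ordering of relevances
    return dcg(true_relevance[order]) / dcg(ideal)

print(f"NDCG: {ndcg([3, 2, 3, 0, 1], [0.9, 0.8, 0.4, 0.3, 0.7]):.4f}")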
B) Train the ranking model using labelled data and evaluate its effectiveness.

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# ========================
# Step 1: Prepare a Synthetic Dataset for Ranking
# ========================

# Features (100 samples, 5 features)
X = np.random.rand(100, 5)

# Relevance labels (0: irrelevant, 1: somewhat relevant, 2: highly relevant)
y = np.random.randint(0, 3, size=100)

# Query group (for each query, there are multiple documents).
# We simulate that all 100 documents belong to a single query,
# so the group specification is one group of size 100.
query_group = [100]

# ========================
# Step 2: Train the RankBoost Model using XGBoost
# ========================

# Convert to XGBoost DMatrix format
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group(query_group)  # Set query group sizes for ranking

# Parameters for XGBoost (RankBoost)
params = {
    'objective': 'rank:pairwise',  # Ranking objective
    'eval_metric': 'ndcg',         # Evaluate using NDCG
    'learning_rate': 0.1,          # Learning rate
    'max_depth': 6                 # Maximum depth of trees
}

# Train the model (100 boosting rounds)
rank_boost_model = xgb.train(params, dtrain, num_boost_round=100)

# ========================
# Step 3: Evaluate the Model
# ========================

# Split data for testing (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
query_group_test = [X_test.shape[0]]  # All test documents in one query group

# Convert to DMatrix for testing
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(query_group_test)

# Predict rankings for the test set
y_pred_rankboost = rank_boost_model.predict(dtest)

# Print predicted rankings
print("Predicted Rankings (RankBoost):", y_pred_rankboost)

# ========================
# Step 4: Evaluate Effectiveness Using NDCG
# ========================

# Calculate the Normalized Discounted Cumulative Gain (NDCG).
# NDCG measures how well the ranking matches the true relevance.
ndcg = ndcg_score([y_test], [y_pred_rankboost])
print(f"RankBoost (XGBoost) - NDCG Score: {ndcg:.4f}")

PRACTICAL NO 10

A) Implement a text summarization algorithm (e.g., extractive or abstractive).

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')

# Preprocess the text (tokenize, remove stopwords, etc.)
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())  # Tokenize and convert to lower case
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

# Function to summarize the text using Extractive Summarization
def extractive_summary(text, num_sentences=3):
    # Step 1: Tokenize the text into sentences
    sentences = sent_tokenize(text)
    # Step 2: Preprocess the sentences
    preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences]
    # Step 3: Compute TF-IDF scores for the sentences
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    # Step 4: Compute cosine similarity between each pair of sentences
    cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    # Step 5: Rank sentences by their importance (using a ranking score)
    sentence_scores = cosine_sim_matrix.sum(axis=1)
    # Step 6: Select the top 'num_sentences' sentences based on their scores
    ranked_sentences = [sentences[i] for i in sentence_scores.argsort()[-num_sentences:]]
    return ' '.join(ranked_sentences)

# Sample text for summarization
text = """
Machine learning is a field of artificial intelligence that uses statistical techniques to give
computer systems the ability to learn from data, without being explicitly programmed.
The process of machine learning involves feeding large amounts of data to a computer,
which then processes and analyzes this data to identify patterns.
There are several types of machine learning, including supervised learning, unsupervised learning,
and reinforcement learning. In supervised learning, the algorithm is trained on a labeled dataset,
where the desired output is already known. In unsupervised learning, the algorithm works with
unlabeled data and tries to identify patterns or groupings. Reinforcement learning is based on an
agent that interacts with its environment and learns to take actions based on feedback from its
interactions.
"""

# Get the extractive summary
summary = extractive_summary(text, num_sentences=3)
print("Extractive Summary:")
print(summary)

B) Build a question-answering system using techniques such as information extraction.

import spacy

# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Sample document (could be a passage from a book, article, etc.)
document = """
Albert Einstein was a German-born theoretical physicist who developed the theory of relativity,
one of the two pillars of modern physics. His work is also known for its influence on the
philosophy of science. He was born on March 14, 1879, in Ulm, Kingdom of Württemberg in the
German Empire. Einstein received the Nobel Prize in Physics in 1921 for his discovery of the
photoelectric effect.
""" # Define a function to extract named entities and related information def extract_entities(doc): entities = [] for ent in doc.ents: entities.append((ent.text, ent.label_)) return entities # Define a function for simple question-answering by matching entity types def simple_question_answering(query, doc): doc_nlp = nlp(doc) query_tokens = query.lower().split() if 'who' in query_tokens: # Looking for a person for ent in doc_nlp.ents: if ent.label_ == "PERSON": return f"The answer is: {ent.text}" if 'when' in query_tokens: # Looking for a date for ent in doc_nlp.ents: if ent.label_ == "DATE": return f"The answer is: {ent.text}" if 'where' in query_tokens: # Looking for a location for ent in doc_nlp.ents: if ent.label_ == "GPE": return f"The answer is: {ent.text}" return "Sorry, I couldn't find an answer." # Example Queries queries = [ "Who developed the theory of relativity?", "When was Albert Einstein born?", "Where was Einstein born?" ] # Testing the question-answering system for query in queries: print(f"Question: {query}") answer = simple_question_answering(query, document) print(f"Answer: {answer}") print("-" * 50)