const sessionPool_de = await SessionPool.open({
    maxPoolSize: 25,
    sessionOptions: {
        maxAgeSecs: 10,
        maxUsageCount: 150, // for example when you know that the site blocks after 150 requests.
    },
    persistStateKeyValueStoreId: 'main_session',
    persistStateKey: 'location-specific-session-pool',
});
const session1 = await sessionPool_de.getSession();
const proxyurl = await proxyConfigurationEU.newUrl(session1.id);
console.log(proxyurl);

const cookies_g = [
    { name: 'cookie1', value: 'my-cookie' },
    { name: 'cookie2', value: 'your-cookie' },
];
console.log("original cookies ", cookies_g, "Session_main", session1.id);

session1.setCookies(
    cookies_g,
    'https://www.example.com'
);
// This always logs an empty array:
console.log("getting cookies", session1.getCookies("www.example.com"));
session1.getCookies always returns an empty array. Is there a solution for this? How can I debug it further?

Here is the first createSessionFunction I tried:

createSessionFunction: async (sessionPool, options) => {
    const session1 = await sessionPool.getSession();
    const proxyurl = await proxyConfigurationEU.newUrl(session1.id);
    // Get cookies from the response -- but note that `response` is not defined anywhere in this scope:
    session1.setCookiesFromResponse(response);
    return session1;
}
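For context, setCookiesFromResponse expects a real HTTP response object (something with headers and a url), and the snippet above never obtains one, so response is undefined there. A minimal sketch of where such a response could come from; using got-scraping for the warm-up request is an assumption, not something from the original code:

import { Session } from 'crawlee';
import { gotScraping } from 'got-scraping';

const createSessionFunction = async (sessionPool) => {
    const session = new Session({ sessionPool });
    const proxyUrl = await proxyConfigurationEU.newUrl(session.id);

    // Make one real request so there is a response whose Set-Cookie headers can be stored on the session.
    const response = await gotScraping({ url: 'https://www.example.com', proxyUrl });
    session.setCookiesFromResponse(response);

    return session;
};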
And here is the second attempt, which creates a new Session and seeds it with cookies fetched via Playwright:

createSessionFunction: async (sessionPool, options) => {
    const new_session = new Session({ sessionPool });
    const proxyurl = await proxyConfigurationDE.newUrl(new_session.id);
    console.log(proxyurl, new_session.id);

    const cookies_g = await getCookiesDE(proxyurl, 'https://www.example.com');
    console.log("cookies from playwright..", cookies_g);

    new_session.setCookies(
        cookies_g,
        'https://www.example.com'
    );
    console.log("Checking cookies set..", new_session.getCookieString("example.com"));
    return new_session;
}
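One thing worth checking for the empty getCookies result: the Session keeps its cookies in a tough-cookie jar and matches them by URL, so a bare hostname such as "www.example.com" may simply never match cookies that were stored against "https://www.example.com". A small debugging sketch (a guess, not a confirmed fix):

// Cookies were stored against the full URL...
session1.setCookies(cookies_g, 'https://www.example.com');

// ...so read them back with a full URL as well, instead of the bare hostname.
console.log('with scheme:', session1.getCookies('https://www.example.com'));
console.log('bare host:', session1.getCookies('www.example.com'));

// getCookieString is another quick way to see whether the jar contains anything at all.
console.log('cookie header:', session1.getCookieString('https://www.example.com'));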
The crawler itself is set up like this:

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestQueue: queue,
    useSessionPool: true,
    persistCookiesPerSession: true,
    maxRequestRetries: 20,
    maxRequestsPerMinute: 250,
    autoscaledPoolOptions: {
        maxConcurrency: 100,
        minConcurrency: 5,
        isFinishedFunction: async () => {
            // Tell the pool whether it should finish
            // or wait for more tasks to become available.
            // Return true or false.
            return false;
        },
    },
    failedRequestHandler: async (context) => rebirth_requests({ ...context }),
    requestHandler: async (context) => router({ ...context, dbPool }),
    // sessionPoolOptions: { blockedStatusCodes: [] },
});
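The commented-out sessionPoolOptions line is also where a custom createSessionFunction would go: when useSessionPool is true, the crawler passes sessionPoolOptions on to the SessionPool it creates. A sketch of that wiring, assuming the second attempt above has been extracted into a named function (createSessionDE is a hypothetical name):

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestQueue: queue,
    useSessionPool: true,
    persistCookiesPerSession: true,
    sessionPoolOptions: {
        maxPoolSize: 25,
        // blockedStatusCodes: [],
        createSessionFunction: createSessionDE, // e.g. the Playwright-based function shown earlier
    },
    requestHandler: async (context) => router({ ...context, dbPool }),
    failedRequestHandler: async (context) => rebirth_requests({ ...context }),
});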
Note that RequestList can be used together with RequestQueue by the same crawler. In such cases, each request from RequestList is enqueued into RequestQueue first and then consumed from the latter. This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue). In practical terms, such a combination can be useful when there is a large number of initial URLs, but more URLs would be added dynamically by the crawler.
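A minimal sketch of that combination (the list name and start URLs are placeholders):

import { CheerioCrawler, RequestList, RequestQueue } from 'crawlee';

// A large, static set of start URLs lives in the RequestList...
const requestList = await RequestList.open('start-urls', [
    'https://www.example.com/page-1',
    'https://www.example.com/page-2',
]);

// ...while URLs discovered at runtime go into the RequestQueue.
const requestQueue = await RequestQueue.open();

const crawler = new CheerioCrawler({
    requestList,
    requestQueue,
    requestHandler: async ({ enqueueLinks }) => {
        // Newly discovered links are added to the queue and deduplicated by uniqueKey.
        await enqueueLinks();
    },
});

await crawler.run();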
Request {
id: 'OBTRQI5zvA4aIJ9',
url: 'https://someapi.com',
loadedUrl: 'https://someapi.com',
uniqueKey: '22586062-3f0d-40be-b499-f1a00261b5d3',
method: 'GET',
payload: undefined,
noRetry: false,
retryCount: 0,
errorMessages: [],
headers: {},
userData: [Getter/Setter],
handledAt: undefined
}
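For what it's worth, the uniqueKey in that dump is a UUID rather than the normalized URL, which suggests the request was created with an explicit uniqueKey; that is also the usual way to enqueue the same URL more than once, since the queue deduplicates by uniqueKey. A sketch, with randomUUID standing in for however the key is actually generated:

import { randomUUID } from 'node:crypto';
import { Request } from 'crawlee';

// With the default uniqueKey (derived from the URL), adding the same URL again would be
// deduplicated; an explicit random uniqueKey makes every add a distinct request.
const request = new Request({
    url: 'https://someapi.com',
    uniqueKey: randomUUID(),
});

await queue.addRequest(request);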